diff --git a/docs/src/gpu.md b/docs/src/gpu.md
index 378f4f5d3..4b93fcffd 100644
--- a/docs/src/gpu.md
+++ b/docs/src/gpu.md
@@ -56,33 +56,38 @@ using SparseArrays, Krylov, LinearOperators
 using CUDA, CUDA.CUSPARSE
 
 # Transfer the linear system from the CPU to the GPU
-A_gpu = CuSparseMatrixCSC(A_cpu)  # A_gpu = CuSparseMatrixCSR(A_cpu)
+A_gpu = CuSparseMatrixCSR(A_cpu)  # A_gpu = CuSparseMatrixCSC(A_cpu)
 b_gpu = CuVector(b_cpu)
 
-# LLᴴ ≈ A for CuSparseMatrixCSC or CuSparseMatrixCSR matrices
+# Incomplete decomposition LLᴴ ≈ A for CuSparseMatrixCSC or CuSparseMatrixCSR matrices
 P = ic02(A_gpu, 'O')
 
+# Additional vector required for solving triangular systems
+n = length(b_gpu)
+T = eltype(b_gpu)
+z = similar(CuVector{T}, n)
+
 # Solve Py = x
-function ldiv_ic0!(y, P, x)
-  copyto!(y, x)                        # Variant for CuSparseMatrixCSR
-  sv2!('T', 'U', 'N', 1.0, P, y, 'O')  # sv2!('N', 'L', 'N', 1.0, P, y, 'O')
-  sv2!('N', 'U', 'N', 1.0, P, y, 'O')  # sv2!('T', 'L', 'N', 1.0, P, y, 'O')
+function ldiv_ic0!(P::CuSparseMatrixCSR, x, y, z)
+  ldiv!(z, LowerTriangular(P), x)   # Forward substitution with L
+  ldiv!(y, LowerTriangular(P)', z)  # Backward substitution with Lᴴ
+  return y
+end
+
+function ldiv_ic0!(P::CuSparseMatrixCSC, x, y, z)
+  ldiv!(z, UpperTriangular(P)', x)  # Forward substitution with L
+  ldiv!(y, UpperTriangular(P), z)   # Backward substitution with Lᴴ
   return y
 end
 
 # Operator that model P⁻¹
-n = length(b_gpu)
-T = eltype(b_gpu)
 symmetric = hermitian = true
-opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_ic0!(y, P, x))
+opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_ic0!(P, x, y, z))
 
-# Solve a symmetric positive definite system with an incomplete Cholesky preconditioner on GPU
+# Solve an Hermitian positive definite system with an incomplete Cholesky preconditioner on GPU
 x, stats = cg(A_gpu, b_gpu, M=opM)
 ```
 
-!!! note
-    You need to replace `'T'` by `'C'` in `ldiv_ic0!` if `A_gpu` is a complex matrix.
-
 ### Example with a general square system
 
 ```julia
@@ -96,27 +101,35 @@ A_cpu = A_cpu[p,:]
 b_cpu = b_cpu[p]
 
 # Transfer the linear system from the CPU to the GPU
-A_gpu = CuSparseMatrixCSC(A_cpu)  # A_gpu = CuSparseMatrixCSR(A_cpu)
+A_gpu = CuSparseMatrixCSR(A_cpu)  # A_gpu = CuSparseMatrixCSC(A_cpu)
 b_gpu = CuVector(b_cpu)
 
-# LU ≈ A for CuSparseMatrixCSC or CuSparseMatrixCSR matrices
+# Incomplete decomposition LU ≈ A for CuSparseMatrixCSC or CuSparseMatrixCSR matrices
 P = ilu02(A_gpu, 'O')
 
+# Additional vector required for solving triangular systems
+n = length(b_gpu)
+T = eltype(b_gpu)
+z = similar(CuVector{T}, n)
+
 # Solve Py = x
-function ldiv_ilu0!(y, P, x)
-  copyto!(y, x)                        # Variant for CuSparseMatrixCSR
-  sv2!('N', 'L', 'N', 1.0, P, y, 'O')  # sv2!('N', 'L', 'U', 1.0, P, y, 'O')
-  sv2!('N', 'U', 'U', 1.0, P, y, 'O')  # sv2!('N', 'U', 'N', 1.0, P, y, 'O')
+function ldiv_ilu0!(P::CuSparseMatrixCSR, x, y, z)
+  ldiv!(z, UnitLowerTriangular(P), x)  # Forward substitution with L
+  ldiv!(y, UpperTriangular(P), z)      # Backward substitution with U
+  return y
+end
+
+function ldiv_ilu0!(P::CuSparseMatrixCSC, x, y, z)
+  ldiv!(z, LowerTriangular(P), x)      # Forward substitution with L
+  ldiv!(y, UnitUpperTriangular(P), z)  # Backward substitution with U
   return y
 end
 
 # Operator that model P⁻¹
-n = length(b_gpu)
-T = eltype(b_gpu)
 symmetric = hermitian = false
-opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_ilu0!(y, P, x))
+opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_ilu0!(P, x, y, z))
 
-# Solve an unsymmetric system with an incomplete LU preconditioner on GPU
+# Solve a non-Hermitian system with an incomplete LU preconditioner on GPU
 x, stats = bicgstab(A_gpu, b_gpu, M=opM)
 ```
 
@@ -167,8 +180,8 @@ b_gpu = oneVector(b_cpu)
 x, stats = lsqr(A_gpu, b_gpu)
 ```
 
-!!! warning
-    The library `oneMKL` is not interfaced yet in oneAPI.jl and all BLAS routines (dot, norm, mul!, etc.) dispatch to generic fallbacks.
+!!! note
+    The library `oneMKL` is interfaced in oneAPI.jl and accelerate linear algebra operations on Intel GPUs. Only dense linear systems are supported because sparse linear algebra routines are not interfaced yet.
 
 ## Apple M1 GPUs
 
diff --git a/test/gpu/nvidia.jl b/test/gpu/nvidia.jl
index 908a2819c..4a2428b82 100644
--- a/test/gpu/nvidia.jl
+++ b/test/gpu/nvidia.jl
@@ -26,31 +26,32 @@ include("gpu.jl")
       b_gpu = CuVector(b_cpu)
       n = length(b_gpu)
       T = eltype(b_gpu)
+      z = similar(CuVector{T}, n)
       symmetric = hermitian = true
 
       A_gpu = CuSparseMatrixCSC(A_cpu)
       P = ic02(A_gpu, 'O')
-      function ldiv_csc_ic0!(y, P, x)
-        copyto!(y, x)
-        sv2!('T', 'U', 'N', 1.0, P, y, 'O')
-        sv2!('N', 'U', 'N', 1.0, P, y, 'O')
+      function ldiv_ic0!(P::CuSparseMatrixCSC, x, y, z)
+        ldiv!(z, UpperTriangular(P)', x)
+        ldiv!(y, UpperTriangular(P), z)
         return y
       end
-      opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_csc_ic0!(y, P, x))
+      opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_ic0!(P, x, y, z))
       x, stats = cg(A_gpu, b_gpu, M=opM)
       @test norm(b_gpu - A_gpu * x) ≤ 1e-6
+      @test stats.niter ≤ 38
 
       A_gpu = CuSparseMatrixCSR(A_cpu)
       P = ic02(A_gpu, 'O')
-      function ldiv_csr_ic0!(y, P, x)
-        copyto!(y, x)
-        sv2!('N', 'L', 'N', 1.0, P, y, 'O')
-        sv2!('T', 'L', 'N', 1.0, P, y, 'O')
+      function ldiv_ic0!(P::CuSparseMatrixCSR, x, y, z)
+        ldiv!(z, LowerTriangular(P), x)
+        ldiv!(y, LowerTriangular(P)', z)
         return y
       end
-      opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_csr_ic0!(y, P, x))
+      opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_ic0!(P, x, y, z))
       x, stats = cg(A_gpu, b_gpu, M=opM)
       @test norm(b_gpu - A_gpu * x) ≤ 1e-6
+      @test stats.niter ≤ 38
     end
 
     @testset "ilu0" begin
@@ -64,31 +65,32 @@ include("gpu.jl")
       b_gpu = CuVector(b_cpu)
       n = length(b_gpu)
       T = eltype(b_gpu)
+      z = similar(CuVector{T}, n)
       symmetric = hermitian = false
 
       A_gpu = CuSparseMatrixCSC(A_cpu)
       P = ilu02(A_gpu, 'O')
-      function ldiv_csc_ilu0!(y, P, x)
-        copyto!(y, x)
-        sv2!('N', 'L', 'N', 1.0, P, y, 'O')
-        sv2!('N', 'U', 'U', 1.0, P, y, 'O')
+      function ldiv_ilu0!(P::CuSparseMatrixCSC, x, y, z)
+        ldiv!(z, LowerTriangular(P), x)
+        ldiv!(y, UnitUpperTriangular(P), z)
         return y
       end
-      opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_csc_ilu0!(y, P, x))
+      opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_ilu0!(P, x, y, z))
       x, stats = bicgstab(A_gpu, b_gpu, M=opM)
       @test norm(b_gpu - A_gpu * x) ≤ 1e-6
+      @test stats.niter ≤ 1659
 
       A_gpu = CuSparseMatrixCSR(A_cpu)
       P = ilu02(A_gpu, 'O')
-      function ldiv_csr_ilu0!(y, P, x)
-        copyto!(y, x)
-        sv2!('N', 'L', 'U', 1.0, P, y, 'O')
-        sv2!('N', 'U', 'N', 1.0, P, y, 'O')
+      function ldiv_ilu0!(P::CuSparseMatrixCSR, x, y, z)
+        ldiv!(z, UnitLowerTriangular(P), x)
+        ldiv!(y, UpperTriangular(P), z)
         return y
       end
-      opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_csr_ilu0!(y, P, x))
+      opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_ilu0!(P, x, y, z))
       x, stats = bicgstab(A_gpu, b_gpu, M=opM)
       @test norm(b_gpu - A_gpu * x) ≤ 1e-6
+      @test stats.niter ≤ 1659
     end
   end