From fcbf1894916506cc3e0bc07d04977bb75207ea67 Mon Sep 17 00:00:00 2001
From: Alexis Montoison
Date: Fri, 24 Mar 2023 14:08:02 -0400
Subject: [PATCH] Fix the documentation for CUDA.jl 4.1

---
 .buildkite/pipeline.yml |  2 ++
 docs/src/gpu.md         | 20 +++++++++++------
 test/gpu/nvidia.jl      | 49 ++++++++++++++++++++++++-----------------
 3 files changed, 44 insertions(+), 27 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 0943f7c21..6e28f623d 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -12,6 +12,8 @@ steps:
         Pkg.add("CUDA")
         Pkg.add("LinearOperators")
         Pkg.instantiate()
+        using CUDA
+        CUDA.set_runtime_version!(v"11.8")
         include("test/gpu/nvidia.jl")'

   timeout_in_minutes: 30

diff --git a/docs/src/gpu.md b/docs/src/gpu.md
index 8e95eb84f..3307790b1 100644
--- a/docs/src/gpu.md
+++ b/docs/src/gpu.md
@@ -65,7 +65,7 @@ if CUDA.functional()
   b_gpu = CuVector(b_cpu)

   # IC(0) decomposition LLᴴ ≈ A for CuSparseMatrixCSC or CuSparseMatrixCSR matrices
-  P = ic02(A_gpu, 'O')
+  P = ic02(A_gpu)

   # Additional vector required for solving triangular systems
   n = length(b_gpu)
@@ -101,18 +101,17 @@ using SparseArrays, Krylov, LinearOperators
 using CUDA, CUDA.CUSPARSE, CUDA.CUSOLVER

 if CUDA.functional()
-  # Optional -- Compute a permutation vector p such that A[p,:] has no zero diagonal
-  p = zfd(A_cpu, 'O')
+  # Optional -- Compute a permutation vector p such that A[:,p] has no zero diagonal
+  p = zfd(A_cpu)
   p .+= 1
-  A_cpu = A_cpu[p,:]
-  b_cpu = b_cpu[p]
+  A_cpu = A_cpu[:,p]

   # Transfer the linear system from the CPU to the GPU
   A_gpu = CuSparseMatrixCSR(A_cpu)  # A_gpu = CuSparseMatrixCSC(A_cpu)
   b_gpu = CuVector(b_cpu)

   # ILU(0) decomposition LU ≈ A for CuSparseMatrixCSC or CuSparseMatrixCSR matrices
-  P = ilu02(A_gpu, 'O')
+  P = ilu02(A_gpu)

   # Additional vector required for solving triangular systems
   n = length(b_gpu)
@@ -137,10 +136,17 @@ if CUDA.functional()
   opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_ilu0!(P, x, y, z))

   # Solve a non-Hermitian system with an ILU(0) preconditioner on GPU
-  x, stats = bicgstab(A_gpu, b_gpu, M=opM)
+  x̄, stats = bicgstab(A_gpu, b_gpu, M=opM)
+
+  # Recover the solution of Ax = b from the solution of A[:,p]x̄ = b
+  invp = invperm(p)
+  x = x̄[invp]
 end
 ```

+!!! note
+    The `CUSPARSE` library shipped with CUDA toolkits `v"12.x"` has some bugs. We recommend using an older CUDA toolkit with `CUDA.set_runtime_version!(v"11.8")`.
+
 ## AMD GPUs

 All solvers in Krylov.jl can be used with [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) and allow computations on AMD GPUs.
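The column permutation is the subtle part of the revised ILU(0) example: the Krylov solver returns the solution x̄ of A[:,p]x̄ = b, and the solution of the original system is recovered with `invperm(p)`. Below is a minimal CPU-only sketch of that recovery step; the 4×4 matrix and the permutation are hypothetical, chosen only for illustration, and the direct solve stands in for the preconditioned `bicgstab` call on the GPU.

```julia
using LinearAlgebra, SparseArrays

# Hypothetical matrix with zeros on the diagonal
A = sparse(Float64[1 0  0  4;
                   0 0  7  8;
                   9 0  0 12;
                   0 14 0 16])
b = ones(4)

p = [4, 3, 1, 2]     # a permutation such that A[:,p] has no zero diagonal
B = A[:,p]           # the matrix actually factorized and solved
x̄ = B \ b            # stand-in for the preconditioned Krylov solve
x = x̄[invperm(p)]    # undo the column permutation
norm(b - A * x)      # ≈ 0: x solves the original system Ax = b
```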
diff --git a/test/gpu/nvidia.jl b/test/gpu/nvidia.jl
index 4a2428b82..b03dcd6ab 100644
--- a/test/gpu/nvidia.jl
+++ b/test/gpu/nvidia.jl
@@ -22,6 +22,7 @@ include("gpu.jl")

   @testset "ic0" begin
     A_cpu, b_cpu = sparse_laplacian()
+    @test mapreduce(Aᵢᵢ -> Aᵢᵢ != 0, &, diag(A_cpu)) == true

     b_gpu = CuVector(b_cpu)
     n = length(b_gpu)
@@ -30,7 +31,7 @@ include("gpu.jl")
     symmetric = hermitian = true

     A_gpu = CuSparseMatrixCSC(A_cpu)
-    P = ic02(A_gpu, 'O')
+    P = ic02(A_gpu)
     function ldiv_ic0!(P::CuSparseMatrixCSC, x, y, z)
       ldiv!(z, UpperTriangular(P)', x)
       ldiv!(y, UpperTriangular(P), z)
@@ -39,10 +40,10 @@ include("gpu.jl")
     opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_ic0!(P, x, y, z))
     x, stats = cg(A_gpu, b_gpu, M=opM)
     @test norm(b_gpu - A_gpu * x) ≤ 1e-6
-    @test stats.niter ≤ 38
+    @test stats.niter ≤ 19

-    A_gpu = CuSparseMatrixCSR(A_cpu)
-    P = ic02(A_gpu, 'O')
+    A_gpu = CuSparseMatrixCSR(A_gpu)
+    P = ic02(A_gpu)
     function ldiv_ic0!(P::CuSparseMatrixCSR, x, y, z)
       ldiv!(z, LowerTriangular(P), x)
       ldiv!(y, LowerTriangular(P)', z)
@@ -51,16 +52,22 @@ include("gpu.jl")
     opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_ic0!(P, x, y, z))
     x, stats = cg(A_gpu, b_gpu, M=opM)
     @test norm(b_gpu - A_gpu * x) ≤ 1e-6
-    @test stats.niter ≤ 38
+    @test stats.niter ≤ 19
   end

   @testset "ilu0" begin
-    A_cpu, b_cpu = polar_poisson()
-
-    p = zfd(A_cpu, 'O')
+    A_cpu = Float64[1 0 0 4;
+                    0 0 7 8;
+                    9 0 0 12;
+                    0 14 0 16]
+    A_cpu = sparse(A_cpu)
+    b_cpu = ones(4)
+    @test mapreduce(Aᵢᵢ -> Aᵢᵢ != 0, &, diag(A_cpu)) == false
+
+    p = zfd(A_cpu)
     p .+= 1
-    A_cpu = A_cpu[p,:]
-    b_cpu = b_cpu[p]
+    invp = invperm(p)
+    @test reduce(&, invp .== p) == false

     b_gpu = CuVector(b_cpu)
     n = length(b_gpu)
@@ -68,29 +75,31 @@ include("gpu.jl")
     z = similar(CuVector{T}, n)
     symmetric = hermitian = false

-    A_gpu = CuSparseMatrixCSC(A_cpu)
-    P = ilu02(A_gpu, 'O')
+    A_gpu = CuSparseMatrixCSC(A_cpu[:,p])
+    P = ilu02(A_gpu)
     function ldiv_ilu0!(P::CuSparseMatrixCSC, x, y, z)
       ldiv!(z, LowerTriangular(P), x)
       ldiv!(y, UnitUpperTriangular(P), z)
       return y
     end
     opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_ilu0!(P, x, y, z))
-    x, stats = bicgstab(A_gpu, b_gpu, M=opM)
-    @test norm(b_gpu - A_gpu * x) ≤ 1e-6
-    @test stats.niter ≤ 1659
+    x̄, stats = gmres(A_gpu, b_gpu, M=opM)
+    x = Vector(x̄)[invp]
+    @test norm(b_gpu - A_gpu * x̄) ≤ 1e-6
+    @test norm(b_cpu - A_cpu * x) ≤ 1e-6

-    A_gpu = CuSparseMatrixCSR(A_cpu)
-    P = ilu02(A_gpu, 'O')
+    A_gpu = CuSparseMatrixCSR(A_cpu[:,p])
+    P = ilu02(A_gpu)
     function ldiv_ilu0!(P::CuSparseMatrixCSR, x, y, z)
       ldiv!(z, UnitLowerTriangular(P), x)
       ldiv!(y, UpperTriangular(P), z)
       return y
     end
     opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_ilu0!(P, x, y, z))
-    x, stats = bicgstab(A_gpu, b_gpu, M=opM)
-    @test norm(b_gpu - A_gpu * x) ≤ 1e-6
-    @test stats.niter ≤ 1659
+    x̄, stats = gmres(A_gpu, b_gpu, M=opM)
+    x = Vector(x̄)[invp]
+    @test norm(b_gpu - A_gpu * x̄) ≤ 1e-6
+    @test norm(b_cpu - A_cpu * x) ≤ 1e-6
   end
 end
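For reference, the CI change in `.buildkite/pipeline.yml` can be reproduced locally with a sketch like the one below. The exact local setup is an assumption; the only patch-specific step is pinning the CUDA runtime with `CUDA.set_runtime_version!(v"11.8")`, which takes effect after restarting Julia.

```julia
using Pkg
Pkg.add("CUDA")
Pkg.add("LinearOperators")
Pkg.instantiate()

using CUDA
CUDA.set_runtime_version!(v"11.8")  # pin the CUDA 11.8 runtime; restart Julia afterwards

# After restarting Julia, from the root of the Krylov.jl repository:
include("test/gpu/nvidia.jl")
```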