From fcbf1894916506cc3e0bc07d04977bb75207ea67 Mon Sep 17 00:00:00 2001
From: Alexis Montoison
Date: Fri, 24 Mar 2023 14:08:02 -0400
Subject: [PATCH] Fix the documentation for CUDA.jl 4.1

---
 .buildkite/pipeline.yml |  2 ++
 docs/src/gpu.md         | 20 +++++++++++------
 test/gpu/nvidia.jl      | 49 ++++++++++++++++++++++++-----------------
 3 files changed, 44 insertions(+), 27 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 0943f7c21..6e28f623d 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -12,6 +12,8 @@ steps:
         Pkg.add("CUDA")
         Pkg.add("LinearOperators")
         Pkg.instantiate()
+        using CUDA
+        CUDA.set_runtime_version!(v"11.8")
         include("test/gpu/nvidia.jl")'

   timeout_in_minutes: 30

diff --git a/docs/src/gpu.md b/docs/src/gpu.md
index 8e95eb84f..3307790b1 100644
--- a/docs/src/gpu.md
+++ b/docs/src/gpu.md
@@ -65,7 +65,7 @@ if CUDA.functional()
   b_gpu = CuVector(b_cpu)

   # IC(0) decomposition LLᴴ ≈ A for CuSparseMatrixCSC or CuSparseMatrixCSR matrices
-  P = ic02(A_gpu, 'O')
+  P = ic02(A_gpu)

   # Additional vector required for solving triangular systems
   n = length(b_gpu)
@@ -101,18 +101,17 @@ using SparseArrays, Krylov, LinearOperators
 using CUDA, CUDA.CUSPARSE, CUDA.CUSOLVER

 if CUDA.functional()
-  # Optional -- Compute a permutation vector p such that A[p,:] has no zero diagonal
-  p = zfd(A_cpu, 'O')
+  # Optional -- Compute a permutation vector p such that A[:,p] has no zero diagonal
+  p = zfd(A_cpu)
   p .+= 1
-  A_cpu = A_cpu[p,:]
-  b_cpu = b_cpu[p]
+  A_cpu = A_cpu[:,p]

   # Transfer the linear system from the CPU to the GPU
   A_gpu = CuSparseMatrixCSR(A_cpu)  # A_gpu = CuSparseMatrixCSC(A_cpu)
   b_gpu = CuVector(b_cpu)

   # ILU(0) decomposition LU ≈ A for CuSparseMatrixCSC or CuSparseMatrixCSR matrices
-  P = ilu02(A_gpu, 'O')
+  P = ilu02(A_gpu)

   # Additional vector required for solving triangular systems
   n = length(b_gpu)
@@ -137,10 +136,17 @@ if CUDA.functional()
   opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_ilu0!(P, x, y, z))

   # Solve a non-Hermitian system with an ILU(0) preconditioner on GPU
-  x, stats = bicgstab(A_gpu, b_gpu, M=opM)
+  x̄, stats = bicgstab(A_gpu, b_gpu, M=opM)
+
+  # Recover the solution of Ax = b from the solution of A[:,p]x̄ = b
+  invp = invperm(p)
+  x = x̄[invp]
 end
 ```

+!!! note
+    The `CUSPARSE` library shipped with CUDA toolkits `v"12.x"` has some bugs. We recommend using an older CUDA toolkit with `CUDA.set_runtime_version!(v"11.8")`.
+
 ## AMD GPUs

 All solvers in Krylov.jl can be used with [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) and allow computations on AMD GPUs.
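The column permutation is the subtle part of the revised ILU(0) example: the Krylov solver returns the solution x̄ of A[:,p]x̄ = b, and the solution of the original system is recovered with `invperm(p)`. Below is a minimal CPU-only sketch of that recovery step; the 4×4 matrix and the permutation are hypothetical, chosen only for illustration, and the direct solve stands in for the preconditioned `bicgstab` call on the GPU.

```julia
using LinearAlgebra, SparseArrays

# Hypothetical matrix with zeros on the diagonal
A = sparse(Float64[1 0  0  4;
                   0 0  7  8;
                   9 0  0 12;
                   0 14 0 16])
b = ones(4)

p = [4, 3, 1, 2]     # a permutation such that A[:,p] has no zero diagonal
B = A[:,p]           # the matrix actually factorized and solved
x̄ = B \ b            # stand-in for the preconditioned Krylov solve
x = x̄[invperm(p)]    # undo the column permutation
norm(b - A * x)      # ≈ 0: x solves the original system Ax = b
```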
diff --git a/test/gpu/nvidia.jl b/test/gpu/nvidia.jl
index 4a2428b82..b03dcd6ab 100644
--- a/test/gpu/nvidia.jl
+++ b/test/gpu/nvidia.jl
@@ -22,6 +22,7 @@ include("gpu.jl")

   @testset "ic0" begin
     A_cpu, b_cpu = sparse_laplacian()
+    @test mapreduce(Aᵢᵢ -> Aᵢᵢ != 0, &, diag(A_cpu)) == true

     b_gpu = CuVector(b_cpu)
     n = length(b_gpu)
@@ -30,7 +31,7 @@ include("gpu.jl")
     symmetric = hermitian = true

     A_gpu = CuSparseMatrixCSC(A_cpu)
-    P = ic02(A_gpu, 'O')
+    P = ic02(A_gpu)
     function ldiv_ic0!(P::CuSparseMatrixCSC, x, y, z)
       ldiv!(z, UpperTriangular(P)', x)
       ldiv!(y, UpperTriangular(P), z)
@@ -39,10 +40,10 @@ include("gpu.jl")
     opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_ic0!(P, x, y, z))
     x, stats = cg(A_gpu, b_gpu, M=opM)
     @test norm(b_gpu - A_gpu * x) ≤ 1e-6
-    @test stats.niter ≤ 38
+    @test stats.niter ≤ 19

-    A_gpu = CuSparseMatrixCSR(A_cpu)
-    P = ic02(A_gpu, 'O')
+    A_gpu = CuSparseMatrixCSR(A_gpu)
+    P = ic02(A_gpu)
     function ldiv_ic0!(P::CuSparseMatrixCSR, x, y, z)
       ldiv!(z, LowerTriangular(P), x)
       ldiv!(y, LowerTriangular(P)', z)
@@ -51,16 +52,22 @@ include("gpu.jl")
     opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_ic0!(P, x, y, z))
     x, stats = cg(A_gpu, b_gpu, M=opM)
     @test norm(b_gpu - A_gpu * x) ≤ 1e-6
-    @test stats.niter ≤ 38
+    @test stats.niter ≤ 19
   end

   @testset "ilu0" begin
-    A_cpu, b_cpu = polar_poisson()
-
-    p = zfd(A_cpu, 'O')
+    A_cpu = Float64[1 0 0 4;
+                    0 0 7 8;
+                    9 0 0 12;
+                    0 14 0 16]
+    A_cpu = sparse(A_cpu)
+    b_cpu = ones(4)
+    @test mapreduce(Aᵢᵢ -> Aᵢᵢ != 0, &, diag(A_cpu)) == false
+
+    p = zfd(A_cpu)
     p .+= 1
-    A_cpu = A_cpu[p,:]
-    b_cpu = b_cpu[p]
+    invp = invperm(p)
+    @test reduce(&, invp .== p) == false

     b_gpu = CuVector(b_cpu)
     n = length(b_gpu)
@@ -68,29 +75,31 @@ include("gpu.jl")
     z = similar(CuVector{T}, n)
     symmetric = hermitian = false

-    A_gpu = CuSparseMatrixCSC(A_cpu)
-    P = ilu02(A_gpu, 'O')
+    A_gpu = CuSparseMatrixCSC(A_cpu[:,p])
+    P = ilu02(A_gpu)
     function ldiv_ilu0!(P::CuSparseMatrixCSC, x, y, z)
       ldiv!(z, LowerTriangular(P), x)
       ldiv!(y, UnitUpperTriangular(P), z)
       return y
     end
     opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_ilu0!(P, x, y, z))
-    x, stats = bicgstab(A_gpu, b_gpu, M=opM)
-    @test norm(b_gpu - A_gpu * x) ≤ 1e-6
-    @test stats.niter ≤ 1659
+    x̄, stats = gmres(A_gpu, b_gpu, M=opM)
+    x = Vector(x̄)[invp]
+    @test norm(b_gpu - A_gpu * x̄) ≤ 1e-6
+    @test norm(b_cpu - A_cpu * x) ≤ 1e-6

-    A_gpu = CuSparseMatrixCSR(A_cpu)
-    P = ilu02(A_gpu, 'O')
+    A_gpu = CuSparseMatrixCSR(A_cpu[:,p])
+    P = ilu02(A_gpu)
     function ldiv_ilu0!(P::CuSparseMatrixCSR, x, y, z)
       ldiv!(z, UnitLowerTriangular(P), x)
       ldiv!(y, UpperTriangular(P), z)
       return y
     end
     opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_ilu0!(P, x, y, z))
-    x, stats = bicgstab(A_gpu, b_gpu, M=opM)
-    @test norm(b_gpu - A_gpu * x) ≤ 1e-6
-    @test stats.niter ≤ 1659
+    x̄, stats = gmres(A_gpu, b_gpu, M=opM)
+    x = Vector(x̄)[invp]
+    @test norm(b_gpu - A_gpu * x̄) ≤ 1e-6
+    @test norm(b_cpu - A_cpu * x) ≤ 1e-6
   end
 end
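For reference, the CI change in `.buildkite/pipeline.yml` can be reproduced locally with a sketch like the one below. The exact local setup is an assumption; the only patch-specific step is pinning the CUDA runtime with `CUDA.set_runtime_version!(v"11.8")`, which takes effect after restarting Julia.

```julia
using Pkg
Pkg.add("CUDA")
Pkg.add("LinearOperators")
Pkg.instantiate()

using CUDA
CUDA.set_runtime_version!(v"11.8")  # pin the CUDA 11.8 runtime; restart Julia afterwards

# After restarting Julia, from the root of the Krylov.jl repository:
include("test/gpu/nvidia.jl")
```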