From 3a14489b7ae9888247db21d631fd98f25cc46d6f Mon Sep 17 00:00:00 2001 From: Markus Battarbee Date: Wed, 6 Mar 2024 18:51:25 +0200 Subject: [PATCH 1/2] Moved to LUMI 23.09 stack with rocm 5.6.1 (5.2 drivers AFAIK) --- MAKE/Makefile.lumi_hipcc | 28 ++++++++++++++--------- testpackage/small_test_lumi_gpu.sh | 36 +++++++++++++++++++++--------- 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/MAKE/Makefile.lumi_hipcc b/MAKE/Makefile.lumi_hipcc index c0cc7942b..3744f5dc6 100644 --- a/MAKE/Makefile.lumi_hipcc +++ b/MAKE/Makefile.lumi_hipcc @@ -2,14 +2,18 @@ CMP = hipcc LNK = clang++ # Modules loaded (after clean shell, no module purging, one-by-one not oneline) -# module load LUMI/22.08 +# module load LUMI/23.09 # module load partition/G # module load cpeAMD -# module load rocm/5.3.3 -# module load Boost/1.79.0-cpeAMD-22.08 +# module load rocm/5.6.1 +# module load Boost/1.82.0-cpeAMD-23.09 +# module load papi/7.0.1.1 +# module load Eigen/3.4.0 # one-liner: -# module load LUMI/22.08; module load partition/G; module load cpeAMD; module load rocm/5.3.3; module load Boost/1.79.0-cpeAMD-22.08 - +# module load LUMI/23.09; module load partition/G; module load cpeAMD; module load rocm/5.6.1; module load Boost/1.82.0-cpeAMD-23.09; module load papi/7.0.1.1; module load Eigen/3.4.0 +# clang++ linking also requires: +# export PATH=$PATH:/appl/lumi/SW/LUMI-23.09/G/EB/rocm/5.6.1/llvm/bin/ + #======== Vectorization ========== #Set vector backend type for vlasov solvers, sets precision and length. #Options: @@ -35,9 +39,9 @@ USE_HIP=1 # LDFLAGS flags for linker # Important note: Do not edit COMPFLAGS in this file! -CXXFLAGS += -g -ggdb -O3 -x hip --offload-arch=gfx90a:xnack- -march=znver3 -std=c++17 -funroll-loops -fopenmp -I. 
-Ihip -Iomp -I${CRAY_MPICH_DIR}/include -W -Wall -Wno-unused-parameter -Wno-unused-result -Wno-unused-function -Wno-unused-variable -Wno-unknown-pragmas -Wno-deprecated-register -Wno-unused-but-set-variable +CXXFLAGS += -g -ggdb -O3 -x hip --offload-arch=gfx90a:xnack- -march=znver3 -std=c++17 -funroll-loops -fopenmp -I. -Ihip -Iomp -I${CRAY_MPICH_DIR}/include -W -Wall -Wno-unused-parameter -Wno-unused-result -Wno-unused-function -Wno-unused-variable -Wno-unknown-pragmas -Wno-deprecated-register -Wno-unused-but-set-variable -Wno-ignored-attributes -testpackage: CXXFLAGS = -g -ggdb -O2 -x hip --offload-arch=gfx90a:xnack- -march=znver3 -std=c++17 -fopenmp -I. -Ihip -Iomp -I${CRAY_MPICH_DIR}/include -fgpu-sanitize -W -Wall -Wno-unused-parameter -Wno-unused-result -Wno-unused-function -Wno-unused-variable -Wno-unknown-pragmas -Wno-deprecated-register -Wno-unused-but-set-variable +testpackage: CXXFLAGS = -g -ggdb -O2 -x hip --offload-arch=gfx90a:xnack- -march=znver3 -std=c++17 -fopenmp -I. -Ihip -Iomp -I${CRAY_MPICH_DIR}/include -fgpu-sanitize -W -Wall -Wno-unused-parameter -Wno-unused-result -Wno-unused-function -Wno-unused-variable -Wno-unknown-pragmas -Wno-deprecated-register -Wno-unused-but-set-variable -Wno-ignored-attributes LDFLAGS = -fopenmp -lrt -lpthread -L${CRAY_MPICH_DIR}/lib ${PE_MPICH_GTL_DIR_amd_gfx90a} -L${ROCM_PATH}/lib -lamdhip64 LIB_MPI = -lmpi ${PE_MPICH_GTL_LIBS_amd_gfx90a} @@ -63,7 +67,7 @@ testpackage: CXXFLAGS += -DPAPI_MEM #======== Libraries =========== # Select the base directory based on which project you are using: # LUMILAPIO -LIBRARY_PREFIX = /projappl/project_462000358/libraries +LIBRARY_PREFIX = /projappl/project_462000358/libraries/23.09 # Compiled libraries #INC_BOOST = -isystem $(LIBRARY_PREFIX)/boost/include @@ -76,8 +80,10 @@ LIB_ZOLTAN = -L$(LIBRARY_PREFIX)/zoltan/lib -lzoltan -Wl,-rpath=$(LIBRARY_PREFIX #INC_JEMALLOC = -I$(LIBRARY_PREFIX)/jemalloc/include #LIB_JEMALLOC = -L$(LIBRARY_PREFIX)/jemalloc/lib -ljemalloc 
-Wl,-rpath=$(LIBRARY_PREFIX)/jemalloc/lib -INC_PAPI = -isystem $(LIBRARY_PREFIX)/papi/include -LIB_PAPI = -lpapi -L$(LIBRARY_PREFIX)/papi/lib -Wl,-rpath=$(LIBRARY_PREFIX)/papi/lib +#INC_PAPI = -isystem $(LIBRARY_PREFIX)/papi/include +#LIB_PAPI = -lpapi -L$(LIBRARY_PREFIX)/papi/lib -Wl,-rpath=$(LIBRARY_PREFIX)/papi/lib +INC_PAPI = -isystem /opt/cray/pe/papi/7.0.1.1/include/ +LIB_PAPI = -lpapi -L/opt/cray/pe/papi/7.0.1.1/lib -Wl,-rpath=/opt/cray/pe/papi/7.0.1.1/lib INC_VLSV = -isystem $(LIBRARY_PREFIX)/vlsv LIB_VLSV = -L$(LIBRARY_PREFIX)/vlsv -lvlsv -Wl,-rpath=$(LIBRARY_PREFIX)/vlsv @@ -87,7 +93,7 @@ LIB_PROFILE = -L$(LIBRARY_PREFIX)/phiprof/lib -lphiprof -lgfortran -Wl,-rpath=$( #header libraries -INC_EIGEN = -isystem $(LIBRARY_PREFIX)/eigen/ +#INC_EIGEN = -isystem $(LIBRARY_PREFIX)/eigen/ INC_FSGRID = -I./submodules/fsgrid INC_DCCRG = -I./submodules/dccrg # Vectorclass only for CPU mode diff --git a/testpackage/small_test_lumi_gpu.sh b/testpackage/small_test_lumi_gpu.sh index ba6e081a0..c361e252c 100755 --- a/testpackage/small_test_lumi_gpu.sh +++ b/testpackage/small_test_lumi_gpu.sh @@ -4,10 +4,13 @@ #SBATCH --partition=small-g #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=8 -#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --gpus-per-node=1 +##SBATCH --ntasks-per-node=8 +##SBATCH --gpus-per-node=8 #SBATCH --time=24:00:00 +##SBATCH --time=3:00:00 #SBATCH --account=project_462000358 #SBATCH --exclusive #SBATCH --mem=0 @@ -21,18 +24,21 @@ create_verification_files=0 reference_dir="/scratch/project_462000358/testpackage/" cd $SLURM_SUBMIT_DIR -bin="/scratch/project_462000358/testpackage/vlasiator_gpu_wid4_tp" +bin="/scratch/project_462000358/testpackage/vlasiator_gpu_2309_tp" diffbin="/scratch/project_462000358/testpackage/vlsvdiff_DP_gpu" # compare agains which revision? 
reference_revision="current" +# place before exec +#LD_PRELOAD=/users/marbat/git/vlasiator-mempool/libpreload-me-2309.so + # set up GPU/CPU bindings cat << EOF > select_gpu_${SLURM_JOB_ID} #!/bin/bash export ROCR_VISIBLE_DEVICES=\$SLURM_LOCALID export OMP_NUM_THREADS=7 -exec \$* +LD_PRELOAD=/users/marbat/git/vlasiator-mempool/libpreload-me-2309.so exec \$* EOF chmod +x ./select_gpu_${SLURM_JOB_ID} # this should set the ordering correctly: "4 5 2 3 6 7 0 1" @@ -41,11 +47,18 @@ CPU_BIND="${CPU_BIND},7e0000,7e000000" CPU_BIND="${CPU_BIND},7e,7e00" CPU_BIND="${CPU_BIND},7e00000000,7e0000000000" -module load LUMI/22.08 +# module load LUMI/22.08 +# module load partition/G +# module load cpeAMD +# module load rocm/5.3.3 +# module load Boost/1.79.0-cpeAMD-22.08 +module load LUMI/23.09 module load partition/G module load cpeAMD -module load rocm/5.3.3 -module load Boost/1.79.0-cpeAMD-22.08 +module load rocm/5.6.1 +module load Boost/1.82.0-cpeAMD-23.09 +module load papi/7.0.1.1 +module load Eigen/3.4.0 module list export OMP_PLACES=cores @@ -56,13 +69,14 @@ export MPICH_GPU_SUPPORT_ENABLED=1 export HSA_XNACK=0 # use extra threads for MPI in background export MPICH_ASYNC_PROGRESS=1 - +# allow 16 in-parallel queues +export GPU_MAX_HW_QUEUES=16 # Command for running tests and diffs with MPI # run_command="srun --cpu-bind=${CPU_BIND} ${SLURM_SUBMIT_DIR}/select_gpu " # No MPI testing for now -run_command="srun -n 1 --cpu-bind=${CPU_BIND} ${SLURM_SUBMIT_DIR}/select_gpu " -small_run_command="srun -n 1 --cpu-bind=${CPU_BIND} ${SLURM_SUBMIT_DIR}/select_gpu " -run_command_tools="srun -n 1 --cpu-bind=${CPU_BIND} ${SLURM_SUBMIT_DIR}/select_gpu" +run_command="srun -n 1 --cpu-bind=${CPU_BIND} ${SLURM_SUBMIT_DIR}/select_gpu_${SLURM_JOB_ID} " +small_run_command="srun -n 1 --cpu-bind=${CPU_BIND} ${SLURM_SUBMIT_DIR}/select_gpu_${SLURM_JOB_ID} " +run_command_tools="srun -n 1 --cpu-bind=${CPU_BIND} ${SLURM_SUBMIT_DIR}/select_gpu_${SLURM_JOB_ID}" umask 007 From 
f3bc0e44fcb0e763716784d3dcdfdc92f2ec20c7 Mon Sep 17 00:00:00 2001 From: Markus Battarbee Date: Thu, 7 Mar 2024 08:46:25 +0200 Subject: [PATCH 2/2] Comment out old prefetch --- grid.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/grid.cpp b/grid.cpp index b4f88fbde..59e04298e 100644 --- a/grid.cpp +++ b/grid.cpp @@ -295,9 +295,9 @@ void initializeGrids( for (size_t i=0; i<cells.size(); ++i) { mpiGrid[cells[i]]->parameters[CellParams::LBWEIGHTCOUNTER] = 0; #ifdef USE_GPU - SpatialCell* cell = mpiGrid[cells[i]]; - cell->prefetchDevice(); // Currently projects still init on host - cell->gpu_advise(); + // SpatialCell* cell = mpiGrid[cells[i]]; + // cell->prefetchDevice(); + // cell->gpu_advise(); #endif }