diff --git a/thirdParty/cupla/.gitignore b/thirdParty/cupla/.gitignore index a79da23ddb..27970f2edd 100644 --- a/thirdParty/cupla/.gitignore +++ b/thirdParty/cupla/.gitignore @@ -8,6 +8,12 @@ /*.cbp /*.layout +# Visual Studio Code configuration files +.vscode + +# JetBrains project files +.idea/ + # python byte code *.pyc diff --git a/thirdParty/cupla/.gitlab-ci.yml b/thirdParty/cupla/.gitlab-ci.yml index df72c681bb..8d6750e24e 100644 --- a/thirdParty/cupla/.gitlab-ci.yml +++ b/thirdParty/cupla/.gitlab-ci.yml @@ -1,120 +1,78 @@ -.base_job: - script: - # the default build type is Release - # if neccesary, you can rerun the pipeline with another build type-> https://docs.gitlab.com/ee/ci/pipelines.html#manually-executing-pipelines - # to change the build type, you must set the environment variable CUPLA_BUILD_TYPE - - if [[ ! -v CUPLA_BUILD_TYPE ]] ; then - CUPLA_BUILD_TYPE=Release ; - fi - - echo "number of processor threads $(nproc)" - - $CXX --version - - cmake --version - # print boost version - - echo -e "#include \n#include \nint main() { std::cout << BOOST_VERSION << std::endl; return 0; }" | $CXX -x c++ - -o boost_version >/dev/null || { echo 0; } - - echo "Boost version $(./boost_version)" - - export cupla_DIR=$CI_PROJECT_DIR - # use one build directory for all build configurations - - mkdir build - - cd build - - echo "Build type-> $CUPLA_BUILD_TYPE" - # ALPAKA_ACCS contains the backends, which are used for each build - # the backends are set in the sepcialized base jobs .base_gcc,.base_clang and.base_cuda - - for CMAKE_FLAGS in $ALPAKA_ACCS ; do - echo "###################################################" - && echo "# Example Matrix Multiplication (adapted original)" - && echo "###################################################" - && echo "can not run with CPU_B_SEQ_T_SEQ due to missing elements layer in original SDK example" - && echo "CPU_B_SEQ_T_OMP2/THREADS too many threads necessary (256)" - && if [[ $CMAKE_FLAGS =~ -*DALPAKA_ACC_GPU_CUDA_ENABLE=ON.* 
]]; then - cmake $cupla_DIR/example/CUDASamples/matrixMul/ $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=$CUPLA_BUILD_TYPE - && make -j - && time ./matrixMul -wA=64 -wB=64 -hA=64 -hB=64 - && rm -r * ; - fi - && echo "###################################################" - && echo "# Example Async API (adapted original)" - && echo "###################################################" - && echo "can not run with CPU_B_SEQ_T_SEQ due to missing elements layer in original SDK example" - && echo "CPU_B_SEQ_T_OMP2/THREADS too many threads necessary (512)" - && if [[ $CMAKE_FLAGS =~ -*DALPAKA_ACC_GPU_CUDA_ENABLE=ON.* ]]; then - cmake $cupla_DIR/example/CUDASamples/asyncAPI/ $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=$CUPLA_BUILD_TYPE - && make -j - && time ./asyncAPI - && rm -r * ; - fi - && echo "###################################################" - && echo "# Example Async API (added elements layer)" - && echo "###################################################" - && cmake $cupla_DIR/example/CUDASamples/asyncAPI_tuned/ $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=$CUPLA_BUILD_TYPE - && make -j - && time ./asyncAPI_tuned - && rm -r * - && echo "###################################################" - && echo "Example vectorAdd (added elements layer)" - && echo "###################################################" - && cmake $cupla_DIR/example/CUDASamples/vectorAdd/ $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=$CUPLA_BUILD_TYPE - && make -j - && time ./vectorAdd 100000 - && rm -r * - && echo "###################################################" - && echo "Example cuplaVectorAdd (added elements layer)" - && echo "###################################################" - && cmake $cupla_DIR/example/CUDASamples/cuplaVectorAdd/ $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=$CUPLA_BUILD_TYPE - && make -j - && time ./cuplaVectorAdd 100000 - && rm -r * ; - done +################################################################################ +# CUPLA_CXX : {g++, clang++} +# [g++] : {5, 6, 7, 8, 9} +# [clang++] : {4.0, 5.0, 6.0, 7, 8, 9, 10} +# 
CUPLA_BOOST_VERSIONS : {1.65.1, 1.66.0, 1.67.0, 1.68.0, 1.69.0, 1.70.0, 1.71.0, 1.72.0, 1.73.0} +# CUPLA_BUILD_TYPE : {Debug, Release} +# CUPLA_CMAKE_ARGS : +include: + - local: '/script/compiler_base.yml' -.base_gcc: +cuda92: + image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci:cuda9.2 variables: - GIT_SUBMODULE_STRATEGY: normal - CXX: g++ - CC: gcc - ALPAKA_ACCS: "-DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=ON - -DALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=ON - -DALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE=ON" - # -DALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE=ON - extends: .base_job - # x86_64 tag is used to get a multi-core CPU for the tests - tags: - - x86_64 + CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0" + extends: .base_cuda + +cuda100: + image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci:cuda10.0 + variables: + CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0" + extends: .base_cuda + +cuda101: + image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci:cuda10.1 + variables: + CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0" + extends: .base_cuda + +cuda102: + image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci:cuda10.2 + variables: + CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0" + extends: .base_cuda -.base_clang: +gcc1: variables: - GIT_SUBMODULE_STRATEGY: normal - CXX: clang++ - CC: clang - ALPAKA_ACCS: "-DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=ON - -DALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE=ON" - # -DALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=ON - # -DALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE=ON - extends: .base_job - # x86_64 tag is used to get a multi-core CPU for the tests - tags: - - x86_64 + CUPLA_CXX: "g++-5 g++-6 g++-7 g++-8 g++-9" + CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0" + extends: .base_gcc -.base_cuda: +gcc2: variables: - GIT_SUBMODULE_STRATEGY: normal - CXX: g++ - CC: gcc - 
ALPAKA_ACCS: "-DALPAKA_ACC_GPU_CUDA_ENABLE=ON" - before_script: - - nvidia-smi - - nvcc --version - extends: .base_job - tags: - - cuda - - intel + CUPLA_CXX: "g++-5 g++-6 g++-7 g++-8 g++-9" + CUPLA_BOOST_VERSIONS: "1.68.0 1.69.0 1.70.0" + extends: .base_gcc -gcc7: - image: registry.gitlab.com/hzdr/cupla-docker/gcc7:latest +gcc3: + variables: + CUPLA_CXX: "g++-5 g++-6 g++-7 g++-8 g++-9" + CUPLA_BOOST_VERSIONS: "1.71.0 1.72.0 1.73.0" extends: .base_gcc -clang7: - image: registry.gitlab.com/hzdr/cupla-docker/clang7:latest +clang: + variables: + CUPLA_CXX: "clang++-5.0 clang++-6.0 clang++-7 clang++-8 clang++-9 clang++-10" + CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0" extends: .base_clang -cuda9: - image: registry.gitlab.com/hzdr/cupla-docker/cuda9:latest - extends: .base_cuda +cudaClang92: + image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci:cuda9.2Clang + variables: + CUPLA_CXX: "clang++-8 clang++-9 clang++-10" + CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0" + extends: .base_cuda_clang + +cudaClang100: + image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci:cuda10.0Clang + variables: + CUPLA_CXX: "clang++-8 clang++-9 clang++-10" + CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0" + extends: .base_cuda_clang + +cudaClang101: + image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci:cuda10.1Clang + variables: + CUPLA_CXX: "clang++-9 clang++-10" + CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0" + extends: .base_cuda_clang diff --git a/thirdParty/cupla/INSTALL.md b/thirdParty/cupla/INSTALL.md index 64191d8b90..a9e3c51bc1 100644 --- a/thirdParty/cupla/INSTALL.md +++ b/thirdParty/cupla/INSTALL.md @@ -14,7 +14,7 @@ Requirements - `export CMAKE_PREFIX_PATH=$CUPLA_ROOT:$CMAKE_PREFIX_PATH` - example: - `mkdir -p $HOME/src` - - `git clone 
git://github.com/alpaka-group/cupla.git $HOME/src/cupla` + - `git clone https://github.com/alpaka-group/cupla.git $HOME/src/cupla` - `cd $HOME/src/cupla` - `export CUPLA_ROOT=$HOME/src/cupla` - use a different alpaka installation: diff --git a/thirdParty/cupla/alpaka/.github/workflows/ci.yml b/thirdParty/cupla/alpaka/.github/workflows/ci.yml index 9ae970e4a7..38f7951f1c 100644 --- a/thirdParty/cupla/alpaka/.github/workflows/ci.yml +++ b/thirdParty/cupla/alpaka/.github/workflows/ci.yml @@ -1,136 +1,448 @@ +# +# Copyright 2015-2020 Benjamin Worpitz +# +# This file is part of alpaka. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# + name: Continuous Integration on: [push, pull_request] +################################################################################ +# NOTE: Testing the full matrix is not practical. +# Therefore we aim to have each value set in at least one job. 
+# CXX : {g++, clang++} +# [g++] ALPAKA_CI_GCC_VER : {5, 6, 7, 8, 9, 10} +# [clang++] ALPAKA_CI_CLANG_VER : {4.0, 5.0, 6.0, 7, 8, 9, 10} +# [cl.exe] ALPAKA_CI_CL_VER : {2017, 2019} +# ALPAKA_CI_STDLIB : {libstdc++, [CXX==clang++]:libc++} +# CMAKE_BUILD_TYPE : {Debug, Release} +# ALPAKA_CI : {GITHUB} +# ALPAKA_CI_DOCKER_BASE_IMAGE_NAME : {ubuntu:16.04, ubuntu:18.04, ubuntu:20.04} +# ALPAKA_CI_BOOST_BRANCH : {boost-1.65.1, boost-1.66.0, boost-1.67.0, boost-1.68.0, boost-1.69.0, boost-1.70.0, boost-1.71.0, boost-1.72.0, boost-1.73.0} +# ALPAKA_CI_CMAKE_VER : {3.15.7, 3.16.8, 3.17.3, 3.18.0} +# ALPAKA_CI_SANITIZERS : {ASan, UBsan, TSan} +# TSan is not currently used because it produces many unexpected errors +# ALPAKA_CI_ANALYSIS : {ON, OFF} +# ALPAKA_DEBUG : {0, 1, 2} +# ALPAKA_ACC_GPU_CUDA_ONLY_MODE : {ON, OFF} +# ALPAKA_ACC_GPU_HIP_ONLY_MODE : {ON, OFF} +# ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE : {ON, OFF} +# ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE : {ON, OFF} +# ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE : {ON, OFF} +# ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE : {ON, OFF} +# [ON] OMP_NUM_THREADS : {1, 2, 3, 4} +# ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE : {ON, OFF} +# [ON] OMP_NUM_THREADS : {1, 2, 3, 4} +# ALPAKA_ACC_CPU_BT_OMP4_ENABLE : {ON, OFF} +# [ON] OMP_NUM_THREADS : {1, 2, 3, 4} +# ALPAKA_ACC_GPU_CUDA_ENABLE : {ON, OFF} +# [ON] ALPAKA_CUDA_VERSION : {9.0, 9.1, 9.2, 10.0, 10.1, 10.2, 11.0} +# [ON] ALPAKA_CUDA_COMPILER : {nvcc, [CXX==clang++]:clang} +# ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE : {ON, OFF} +# ALPAKA_ACC_GPU_HIP_ENABLE : {ON, OFF} +# [ON] ALPAKA_CI_HIP_BRANCH : {rocm-3.5.0} +# [ON] ALPAKA_HIP_PLATFORM : {nvcc} + env: ALPAKA_CI: GITHUB TBB_ROOT: tbb - BOOST_ROOT: boost - ALPAKA_CI_BOOST_LIB_DIR: boost_libs - ALPAKA_CI_CLANG_DIR: llvm - ALPAKA_CI_CMAKE_DIR: CMake - ALPAKA_CI_CUDA_DIR: CUDA - ALPAKA_CI_HIP_ROOT_DIR: hip + BOOST_ROOT: ${{ github.workspace }}/boost + ALPAKA_CI_BOOST_LIB_DIR: ${{ github.workspace }}/boost_libs + ALPAKA_CI_CMAKE_DIR: ${{ github.workspace }}/CMake + 
ALPAKA_CI_CUDA_DIR: ${{ github.workspace }}/CUDA + ALPAKA_CI_HIP_ROOT_DIR: ${{ github.workspace }}/hip ALPAKA_CI_SANITIZERS: ALPAKA_CI_ANALYSIS: OFF + ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: ON + ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: ON + ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: ON + ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: ON + ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: ON + ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: ON + ALPAKA_ACC_CPU_BT_OMP4_ENABLE: ON ALPAKA_ACC_GPU_CUDA_ENABLE: OFF + ALPAKA_ACC_GPU_CUDA_ONLY_MODE: OFF ALPAKA_ACC_GPU_HIP_ENABLE: OFF + ALPAKA_ACC_GPU_HIP_ONLY_MODE: OFF + ALPAKA_CI_DOCKER_IMAGE_NAME: alpaka_ubuntu jobs: - ### Analysis builds - windows_cl-2019_debug_analysis: - name: Windows cl-2019 Debug Analysis - runs-on: windows-2019 - steps: - - uses: actions/checkout@v1 - - name: build + test - shell: bash - run: cd ${GITHUB_WORKSPACE} && ./script/ci.sh - env: {TRAVIS_OS_NAME: windows, CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2019, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 3.16.4, ALPAKA_CI_ANALYSIS: ON, ALPAKA_DEBUG: 2} + ci: + name: ${{ matrix.name }} + runs-on: ${{ matrix.os }} + env: ${{ matrix.env }} - ### Windows - windows_cl-2017_release: - name: Windows cl-2017 Release - runs-on: windows-2016 - steps: - - uses: actions/checkout@v1 - - name: build + test - shell: bash - run: cd ${GITHUB_WORKSPACE} && ./script/ci.sh - env: {TRAVIS_OS_NAME: windows, CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2017, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.71.0, ALPAKA_CI_CMAKE_VER: 3.16.4, OMP_NUM_THREADS: 4, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF} + strategy: + fail-fast: false + matrix: + include: + ### Analysis builds + - name: linux_gcc-8_debug_analysis + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 8, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.66.0, ALPAKA_CI_CMAKE_VER: 
3.18.0, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_CI_ANALYSIS: ON, ALPAKA_DEBUG: 2} + - name: linux_clang-8_debug_analysis + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 8, ALPAKA_CI_STDLIB: libc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.73.0, ALPAKA_CI_CMAKE_VER: 3.17.3, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_CI_ANALYSIS: ON, ALPAKA_DEBUG: 2, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_nvcc-9.1_gcc-5_debug_analysis + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 5, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.15.7, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_CI_ANALYSIS: ON, ALPAKA_DEBUG: 2, ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + - name: linux_clang-9_cuda-9.2_debug_analysis + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 9, ALPAKA_CI_STDLIB: libc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.17.3, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_CI_ANALYSIS: ON, ALPAKA_DEBUG: 1, ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.2", ALPAKA_CUDA_COMPILER: clang, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: windows_cl-2019_debug_analysis + os: windows-2019 + env: {CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2019, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.73.0, ALPAKA_CI_CMAKE_VER: 3.16.5, ALPAKA_CI_ANALYSIS: ON, ALPAKA_DEBUG: 2, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, + ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: macos_xcode-11.3_debug_analysis + os: macos-10.15 + env: {CXX: clang++, CC: 
clang, ALPAKA_CI_XCODE_VER: 11.3.1, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_ANALYSIS: ON, ALPAKA_DEBUG: 2, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} - windows_cl-2017_debug: - name: Windows cl-2017 Debug - runs-on: windows-2016 - steps: - - uses: actions/checkout@v1 - - name: build + test - shell: bash - run: cd ${GITHUB_WORKSPACE} && ./script/ci.sh - env: {TRAVIS_OS_NAME: windows, CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2017, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.16.4, OMP_NUM_THREADS: 3, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_CXX_STANDARD: 17} + ### macOS + - name: macos_xcode-11_debug + os: macos-10.15 + env: {CXX: clang++, CC: clang, ALPAKA_CI_XCODE_VER: 11, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CXX_STANDARD: 17, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: macos_xcode-11_release + os: macos-10.15 + env: {CXX: clang++, CC: clang, ALPAKA_CI_XCODE_VER: 11, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.66.0, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: macos_xcode-11.1_debug + os: macos-10.15 + env: {CXX: clang++, CC: clang, ALPAKA_CI_XCODE_VER: 11.1, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: macos_xcode-11.1_release + os: macos-10.15 + env: {CXX: clang++, CC: clang, ALPAKA_CI_XCODE_VER: 11.1, 
CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CXX_STANDARD: 17, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: macos_xcode-11.2_debug + os: macos-10.15 + env: {CXX: clang++, CC: clang, ALPAKA_CI_XCODE_VER: 11.2.1, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.66.0, ALPAKA_CXX_STANDARD: 17, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: macos_xcode-11.2_release + os: macos-10.15 + env: {CXX: clang++, CC: clang, ALPAKA_CI_XCODE_VER: 11.2.1, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: macos_xcode-11.3_debug + os: macos-10.15 + env: {CXX: clang++, CC: clang, ALPAKA_CI_XCODE_VER: 11.3.1, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CXX_STANDARD: 17, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: macos_xcode-11.3_release + os: macos-10.15 + env: {CXX: clang++, CC: clang, ALPAKA_CI_XCODE_VER: 11.3.1, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.73.0, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: macos_xcode-11.4_debug + os: macos-10.15 + env: {CXX: clang++, CC: clang, ALPAKA_CI_XCODE_VER: 11.4, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - 
name: macos_xcode-11.4_release + os: macos-10.15 + env: {CXX: clang++, CC: clang, ALPAKA_CI_XCODE_VER: 11.4, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CXX_STANDARD: 17, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} - windows_cl-2019_release: - name: Windows cl-2019 Release - runs-on: windows-2019 - steps: - - uses: actions/checkout@v1 - - name: build + test - shell: bash - run: cd ${GITHUB_WORKSPACE} && ./script/ci.sh - env: {TRAVIS_OS_NAME: windows, CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2019, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.70.0, ALPAKA_CI_CMAKE_VER: 3.16.4, OMP_NUM_THREADS: 1, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF} + ### Windows + - name: windows_cl-2017_release + os: windows-2016 + env: {CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2017, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.71.0, ALPAKA_CI_CMAKE_VER: 3.16.5, OMP_NUM_THREADS: 4, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF} + - name: windows_cl-2017_debug + os: windows-2016 + env: {CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2017, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.18.0, OMP_NUM_THREADS: 3, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_CXX_STANDARD: 17} + - name: windows_cl-2019_release + os: windows-2019 + env: {CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2019, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.70.0, ALPAKA_CI_CMAKE_VER: 3.17.3, OMP_NUM_THREADS: 1, 
ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF} + - name: windows_cl-2019_debug + os: windows-2019 + env: {CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2019, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 3.16.5, OMP_NUM_THREADS: 4, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF} - windows_cl-2019_debug: - name: Windows cl-2019 Debug - runs-on: windows-2019 - steps: - - uses: actions/checkout@v1 - - name: build + test - shell: bash - run: cd ${GITHUB_WORKSPACE} && ./script/ci.sh - env: {TRAVIS_OS_NAME: windows, CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2019, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 3.16.4, OMP_NUM_THREADS: 4, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF} + ## CUDA 10.0 + # nvcc + MSVC + - name: windows_nvcc-10.0_cl-2017_release_cuda-only_separable-compilation + os: windows-2016 + env: {CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2017, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.17.3, ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_CUDA_ARCH: "30;75", ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON, ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION: ON, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: windows_nvcc-10.0_cl-2017_debug + os: windows-2016 + env: {CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2017, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.16.5, ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", 
ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF} - ## CUDA 10.0 - # nvcc + MSVC - windows_nvcc-10_0_cl-2017_release_cuda-only_separable-compilation: - name: Windows nvcc-10.0 + cl-2017 Release ALPAKA_ACC_GPU_CUDA_ONLY_MODE ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION - runs-on: windows-2016 - steps: - - uses: actions/checkout@v1 - - name: build + test - shell: bash - run: cd ${GITHUB_WORKSPACE} && ./script/ci.sh - env: {TRAVIS_OS_NAME: windows, CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2017, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.16.4, ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_CUDA_ARCH: "30;75", ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON, ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION: ON} + ## CUDA 10.1 + # nvcc + MSVC + - name: windows_nvcc-10.1_cl-2017_debug_cuda-only + os: windows-2016 + env: {CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2017, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.70.0, ALPAKA_CI_CMAKE_VER: 3.16.5, ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_ARCH: "30;75", ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: windows_nvcc-10.1_cl-2017_release + os: windows-2016 + env: {CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2017, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.18.0, ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF} - windows_nvcc-10_0_cl-2017_debug: - 
name: Windows nvcc-10.0 + cl-2017 Debug (Only some CPU backends enabled due to compile time) - runs-on: windows-2016 - steps: - - uses: actions/checkout@v1 - - name: build + test - shell: bash - run: cd ${GITHUB_WORKSPACE} && ./script/ci.sh - env: {TRAVIS_OS_NAME: windows, CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2017, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.66.0, ALPAKA_CI_CMAKE_VER: 3.16.4, ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF} + ## CUDA 10.2 + # nvcc + MSVC + - name: windows_nvcc-10.2_cl-2017_debug_cuda-only + os: windows-2016 + env: {CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2017, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.73.0, ALPAKA_CI_CMAKE_VER: 3.17.3, ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_ARCH: "30;75", ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: windows_nvcc-10.2_cl-2017_release + os: windows-2016 + env: {CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2017, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.16.5, ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF} - ## CUDA 10.1 - # nvcc + MSVC - windows_nvcc-10_1_cl-2017_debug_cuda-only: - name: Windows nvcc-10.1 + cl-2017 Debug ALPAKA_ACC_GPU_CUDA_ONLY_MODE - runs-on: windows-2016 - steps: - - uses: actions/checkout@v1 - - name: 
build + test - shell: bash - run: cd ${GITHUB_WORKSPACE} && ./script/ci.sh - env: {TRAVIS_OS_NAME: windows, CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2017, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.16.4, ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_ARCH: "30;75", ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON} + ### Ubuntu + ## native + # g++ + # We can not enable UBSan when using gcc because it does not have a -fsanitize-blacklist option to suppress errors in boost etc. + # gcc 6 ASan is triggered within libtbb.so + # gcc 7 ASan introduced 'stack-use-after-scope' which is triggered by GOMP_parallel + - name: linux_gcc-5_release + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 5, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.66.0, ALPAKA_CI_CMAKE_VER: 3.15.7, OMP_NUM_THREADS: 3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04"} + - name: linux_gcc-6_debug_c++17 + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 6, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.70.0, ALPAKA_CI_CMAKE_VER: 3.16.5, OMP_NUM_THREADS: 2, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_CXX_STANDARD: 17} + - name: linux_gcc-7_release + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 7, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.17.3, OMP_NUM_THREADS: 1, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04"} + - name: linux_gcc-8_debug + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 8, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 3.18.0, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04"} + - name: linux_gcc-9_debug_c++17 + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 9, ALPAKA_CI_STDLIB: libstdc++, 
CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.15.7, OMP_NUM_THREADS: 3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_CXX_STANDARD: 17, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_gcc-10_release + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 10, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.66.0, ALPAKA_CI_CMAKE_VER: 3.17.3, OMP_NUM_THREADS: 2, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} - windows_nvcc-10_1_cl-2017_release: - name: Windows nvcc-10.1 + cl-2017 Release (Only some CPU backends enabled due to compile time) - runs-on: windows-2016 - steps: - - uses: actions/checkout@v1 - - name: build + test - shell: bash - run: cd ${GITHUB_WORKSPACE} && ./script/ci.sh - env: {TRAVIS_OS_NAME: windows, CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2017, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.16.4, ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF} - ## CUDA 10.2 - # nvcc + MSVC - windows_nvcc-10_2_cl-2017_debug_cuda-only: - name: Windows nvcc-10.2 + cl-2017 Debug ALPAKA_ACC_GPU_CUDA_ONLY_MODE - runs-on: windows-2016 + # clang++ + - name: linux_clang-4_debug_ubsan + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: "4.0", ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.18.0, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_CI_SANITIZERS: UBSan, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_clang-5_debug_c++17 + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: "5.0", ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: 
Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.15.7, OMP_NUM_THREADS: 3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_CXX_STANDARD: 17, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_clang-6_release_asan_c++17 + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: "6.0", ALPAKA_CI_STDLIB: libc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.17.3, OMP_NUM_THREADS: 2, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_CI_SANITIZERS: ASan, ALPAKA_CXX_STANDARD: 17, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_clang-7_release_c++17 + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 7, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.16.5, OMP_NUM_THREADS: 1, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_CXX_STANDARD: 17, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_clang-8_release + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 8, ALPAKA_CI_STDLIB: libc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.73.0, ALPAKA_CI_CMAKE_VER: 3.18.0, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_clang-9_debug + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 9, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.71.0, ALPAKA_CI_CMAKE_VER: 3.16.5, OMP_NUM_THREADS: 1, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_clang-10_release + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 10, ALPAKA_CI_STDLIB: libc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.73.0, ALPAKA_CI_CMAKE_VER: 3.15.7, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + + ## CUDA 9.0 
+ # nvcc + g++ + - name: linux_nvcc-9.0_gcc-5_debug + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 5, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.16.5, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "70", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + # clang++ + - name: linux_clang-6_cuda-9.0_debug + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: "6.0", ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.18.0, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.0", ALPAKA_CUDA_COMPILER: clang, ALPAKA_CUDA_ARCH: "35", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_clang-7_cuda-9.0_release + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 7, ALPAKA_CI_STDLIB: libc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.71.0, ALPAKA_CI_CMAKE_VER: 3.17.3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.0", ALPAKA_CUDA_COMPILER: clang, ALPAKA_CUDA_ARCH: "35;70", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_clang-8_cuda-9.0_debug + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 8, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.18.0, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.0", ALPAKA_CUDA_COMPILER: clang, ALPAKA_CUDA_ARCH: "35", 
ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + + ## CUDA 9.1 + # nvcc + g++ + - name: linux_nvcc-9.1_gcc-5_release + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 5, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.17.3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + # nvcc + clang++ + - name: linux_nvcc-9.1_clang-4_debug + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: "4.0", ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.70.0, ALPAKA_CI_CMAKE_VER: 3.16.5, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "30;70", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + # clang++ + - name: linux_clang-7_cuda-9.1_release + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 7, ALPAKA_CI_STDLIB: libc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.15.7, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.1", ALPAKA_CUDA_COMPILER: clang, ALPAKA_CUDA_ARCH: "35;72", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_clang-8_cuda-9.1_release + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 8, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.18.0, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", 
ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.1", ALPAKA_CUDA_COMPILER: clang, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + + ## CUDA 9.2 + # nvcc + g++ + - name: linux_nvcc-9.2_gcc-5_release + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 5, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.16.5, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "30;35", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + - name: linux_nvcc-9.2_gcc-6_debug_separable_compilation + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 6, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.18.0, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION: ON, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + - name: linux_nvcc-9.2_gcc-7_release_extended_lambda_off + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 7, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.73.0, ALPAKA_CI_CMAKE_VER: 3.17.3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA: OFF, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + # nvcc + clang++ + - name: linux_nvcc-9.2_clang-4_release + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: "4.0", ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.15.7, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", 
ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "30;70", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + # clang++ + - name: linux_clang-7_cuda-9.2_release_c++17 + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 7, ALPAKA_CI_STDLIB: libc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.18.0, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.2", ALPAKA_CUDA_COMPILER: clang, ALPAKA_CXX_STANDARD: 17, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_clang-8_cuda-9.2_debug + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 8, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.70.0, ALPAKA_CI_CMAKE_VER: 3.15.7, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.2", ALPAKA_CUDA_COMPILER: clang, ALPAKA_CUDA_ARCH: "35;72", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_clang-9_cuda-9.2_release + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 9, ALPAKA_CI_STDLIB: libc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.17.3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.2", ALPAKA_CUDA_COMPILER: clang, ALPAKA_CUDA_ARCH: "35;72", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_clang-10_cuda-9.2_release + os: ubuntu-latest + env: {CXX: clang++, CC: clang, 
ALPAKA_CI_CLANG_VER: 10, ALPAKA_CI_STDLIB: libc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.15.7, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.2", ALPAKA_CUDA_COMPILER: clang, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + + ## CUDA 10.0 + # nvcc + g++ + - name: linux_nvcc-10.0_gcc-5_release + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 5, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.15.7, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + - name: linux_nvcc-10.0_gcc-6_debug + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 6, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.16.5, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + - name: linux_nvcc-10.0_gcc-7_release + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 7, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.17.3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "30;35", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + # nvcc + clang++ + - name: linux_nvcc-10.0_clang-4_debug + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: "4.0", ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 
3.18.0, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "30;60", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_nvcc-10.0_clang-5_release_separable_compilation + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: "5.0", ALPAKA_CI_STDLIB: libc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.15.7, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION: ON, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_nvcc-10.0_clang-6_debug + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: "6.0", ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.70.0, ALPAKA_CI_CMAKE_VER: 3.17.3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "75", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + # clang++ + - name: linux_clang-8_cuda-10.0_release + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 8, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.15.7, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_CUDA_COMPILER: clang, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_clang-9_cuda-10.0_debug + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 9, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, 
ALPAKA_CI_BOOST_BRANCH: boost-1.73.0, ALPAKA_CI_CMAKE_VER: 3.17.3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_CUDA_COMPILER: clang, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_clang-10_cuda-10.0_release + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 10, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 3.16.8, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_CUDA_COMPILER: clang, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + + ## CUDA 10.1 + # nvcc + g++ + - name: linux_nvcc-10.1_gcc-5_debug + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 5, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.16.8, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + - name: linux_nvcc-10.1_gcc-6_release + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 6, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.17.3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "30;35", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + - name: linux_nvcc-10.1_gcc-7_debug + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 7, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, 
ALPAKA_CI_CMAKE_VER: 3.18.0, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + - name: linux_nvcc-10.1_gcc-8_release + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 8, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.15.7, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "30;35", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + # nvcc + clang++ + - name: linux_nvcc-10.1_clang-4_debug + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: "4.0", ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.17.3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "75", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_nvcc-10.1_clang-5_release_cuda_only + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: "5.0", ALPAKA_CI_STDLIB: libc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.16.8, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF, ALPAKA_ACC_GPU_HIP_ENABLE: OFF} + - name: linux_nvcc-10.1_clang-6_debug + os: ubuntu-latest + env: {CXX: clang++, CC: clang, 
ALPAKA_CI_CLANG_VER: "6.0", ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.71.0, ALPAKA_CI_CMAKE_VER: 3.17.3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "60", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_nvcc-10.1_clang-7_release + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 7, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.73.0, ALPAKA_CI_CMAKE_VER: 3.16.8, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "75", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_nvcc-10.1_clang-8_debug + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 8, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 3.15.7, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "70", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + # clang++ + - name: linux_clang-9_cuda-10.1_debug + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 9, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.15.7, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: clang, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_clang-10_cuda-10.1_release + os: ubuntu-latest + env: {CXX: clang++, CC: 
clang, ALPAKA_CI_CLANG_VER: 10, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.71.0, ALPAKA_CI_CMAKE_VER: 3.18.0, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: clang, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + + ## CUDA 10.2 + # nvcc + g++ + - name: linux_nvcc-10.2_gcc-5_debug + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 5, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.15.7, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "30;35", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + - name: linux_nvcc-10.2_gcc-6_release + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 6, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.16.8, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + - name: linux_nvcc-10.2_gcc-7_debug + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 7, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.17.3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + - name: linux_nvcc-10.2_gcc-8_release + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 8, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.15.7, 
ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_EMU_MEMCPY3D: ON, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + # nvcc + clang++ + - name: linux_nvcc-10.2_clang-4_release + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: "4.0", ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.15.7, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "30;60", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_nvcc-10.2_clang-5_debug + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: "5.0", ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.17.3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "75", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_nvcc-10.2_clang-6_release_cuda_only + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: "6.0", ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.16.8, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF, ALPAKA_ACC_GPU_HIP_ENABLE: OFF} + - name: linux_nvcc-10.2_clang-7_debug + os: 
ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 7, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 3.18.0, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "75", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_nvcc-10.2_clang-8_release + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 8, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 3.16.8, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "70", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + + ## CUDA 11.0 + # nvcc + g++ + - name: linux_nvcc-11.0_gcc-5_debug + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 5, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.15.7, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "35;80", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + - name: linux_nvcc-11.0_gcc-6_release + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 6, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.16.8, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + - name: linux_nvcc-11.0_gcc-7_debug + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 7, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, 
ALPAKA_CI_CMAKE_VER: 3.17.3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + - name: linux_nvcc-11.0_gcc-8_release + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 8, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.18.0, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_EMU_MEMCPY3D: ON, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + - name: linux_nvcc-11.0_gcc-9_release + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 9, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.16.8, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF} + # nvcc + clang++ + - name: linux_nvcc-11.0_clang-4_release + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: "4.0", ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.15.7, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "35;60", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_nvcc-11.0_clang-5_debug + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: "5.0", ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.17.3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "80", 
ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_nvcc-11.0_clang-6_release_cuda_only + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: "6.0", ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.18.0, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF, ALPAKA_ACC_GPU_HIP_ENABLE: OFF} + - name: linux_nvcc-11.0_clang-7_debug + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 7, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 3.17.3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "75", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_nvcc-11.0_clang-8_release + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 8, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 3.16.8, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + - name: linux_nvcc-11.0_clang-9_release + os: ubuntu-latest + env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 9, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 
3.17.3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "70", ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF} + + ## HIP + - name: linux_hip_nvcc-9.2_gcc-5_debug_hip_only + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 5, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.15.7, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "rrawther/rocm3.5_ubuntu16.04_py3.6_pytorch-ssd", ALPAKA_ACC_GPU_HIP_ENABLE: ON, ALPAKA_ACC_GPU_HIP_ONLY_MODE: ON, ALPAKA_HIP_PLATFORM: clang, ALPAKA_CUDA_COMPILER: clang, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF, ALPAKA_ACC_GPU_CUDA_ENABLE: OFF} + - name: linux_hip_nvcc-9.2_gcc-5_release_hip_only + os: ubuntu-latest + env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 5, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.71.0, ALPAKA_CI_CMAKE_VER: 3.18.0, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "rrawther/rocm3.5_ubuntu16.04_py3.6_pytorch-ssd", ALPAKA_ACC_GPU_HIP_ENABLE: ON, ALPAKA_ACC_GPU_HIP_ONLY_MODE: ON, ALPAKA_HIP_PLATFORM: clang, ALPAKA_CUDA_COMPILER: clang, ALPAKA_EMU_MEMCPY3D: ON, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF, ALPAKA_ACC_GPU_CUDA_ENABLE: OFF} + steps: - uses: actions/checkout@v1 + if: (!contains(github.event.head_commit.message, 'ci_filter') || contains(github.event.head_commit.message, matrix.name )) - name: build + 
test + if: (!contains(github.event.head_commit.message, 'ci_filter') || contains(github.event.head_commit.message, matrix.name )) && (runner.os == 'Windows') + env: + ALPAKA_CI_OS_NAME: ${{runner.os}} shell: bash run: cd ${GITHUB_WORKSPACE} && ./script/ci.sh - env: {TRAVIS_OS_NAME: windows, CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2017, CMAKE_BUILD_TYPE: Debug, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.16.4, ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_ARCH: "30;75", ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON} - - windows_nvcc-10_2_cl-2017_release: - name: Windows nvcc-10.2 + cl-2017 Release (Only some CPU backends enabled due to compile time) - runs-on: windows-2016 - steps: - - uses: actions/checkout@v1 - name: build + test - shell: bash + if: (!contains(github.event.head_commit.message, 'ci_filter') || contains(github.event.head_commit.message, matrix.name )) && (runner.os == 'Linux' || runner.os == 'macOS') + env: + ALPAKA_CI_OS_NAME: ${{runner.os}} run: cd ${GITHUB_WORKSPACE} && ./script/ci.sh - env: {TRAVIS_OS_NAME: windows, CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2017, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.16.4, ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_BT_OMP4_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF} diff --git a/thirdParty/cupla/alpaka/.gitignore b/thirdParty/cupla/alpaka/.gitignore index 8ffbdc9ac0..3be3354aec 100644 --- a/thirdParty/cupla/alpaka/.gitignore +++ b/thirdParty/cupla/alpaka/.gitignore @@ -1,7 +1,5 @@ -/doc/doxygen/* -!/doc/doxygen/Doxyfile -!/doc/doxygen/alpaka_doxygen.png -/doc/latex/* +/docs/doxygen/* + **/build # tmp files diff --git a/thirdParty/cupla/alpaka/.readthedocs.yml b/thirdParty/cupla/alpaka/.readthedocs.yml new file mode 100644 index 0000000000..bec3c9ff2a --- 
/dev/null +++ b/thirdParty/cupla/alpaka/.readthedocs.yml @@ -0,0 +1,18 @@ +# Default [] (epub, pdf, htmlzip) +# Note: PDF/epub/htmlzip output is not supported when using MkDocs +formats: [] + +requirements_file: docs/requirements.txt + +build: + image: latest + +python: + version: 3.7 + +sphinx: + builder: html + configuration: conf.py + fail_on_warning: true + +# see: https://docs.readthedocs.io/en/stable/config-file/v2.html#supported-settings diff --git a/thirdParty/cupla/alpaka/.travis.yml b/thirdParty/cupla/alpaka/.travis.yml deleted file mode 100644 index b5de39ef11..0000000000 --- a/thirdParty/cupla/alpaka/.travis.yml +++ /dev/null @@ -1,311 +0,0 @@ -# -# Copyright 2015-2019 Benjamin Worpitz, Erik Zenker -# -# This file is part of Alpaka. -# -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at http://mozilla.org/MPL/2.0/. -# - -os: linux -dist: xenial -language: generic -services: - - docker - -################################################################################ -# NOTE: Testing the full matrix is not practical. -# Therefore we aim to have each value been set in at lest one job. 
-# CXX : {g++, clang++} -# [g++] ALPAKA_CI_GCC_VER : {5, 6, 7, 8, 9} -# [clang++] ALPAKA_CI_CLANG_VER : {4.0.0, 5.0.2, 6.0.1, 7.0.1, 8.0.0, 9.0.0} -# [cl.exe] ALPAKA_CI_CL_VER : {2017, 2019} -# ALPAKA_CI_STDLIB : {libstdc++, [CXX==clang++]:libc++} -# [clang++] ALPAKA_CI_CLANG_LIBSTDCPP_VERSION : {5, 7} -# CMAKE_BUILD_TYPE : {Debug, Release} -# ALPAKA_CI : {TRAVIS} -# ALPAKA_CI_DOCKER_BASE_IMAGE_NAME : {ubuntu:16.04, ubuntu:18.04} -# ALPAKA_CI_BOOST_BRANCH : {boost-1.65.1, boost-1.66.0, boost-1.67.0, boost-1.68.0, boost-1.69.0, boost-1.70.0, boost-1.71.0} -# ALPAKA_CI_CMAKE_VER : {3.15.7, 3.16.4} -# ALPAKA_CI_SANITIZERS : {ASan, UBsan, TSan} -# TSan is not currently used because it produces many unexpected errors -# ALPAKA_CI_ANALYSIS : {ON, OFF} -# ALPAKA_DEBUG : {0, 1, 2} -# ALPAKA_ACC_GPU_CUDA_ONLY_MODE : {ON, OFF} -# ALPAKA_ACC_GPU_HIP_ONLY_MODE : {ON, OFF} -# ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE : {ON, OFF} -# ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE : {ON, OFF} -# ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE : {ON, OFF} -# ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE : {ON, OFF} -# [ON] OMP_NUM_THREADS : {1, 2, 3, 4} -# ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE : {ON, OFF} -# [ON] OMP_NUM_THREADS : {1, 2, 3, 4} -# ALPAKA_ACC_CPU_BT_OMP4_ENABLE : {ON, OFF} -# [ON] OMP_NUM_THREADS : {1, 2, 3, 4} -# ALPAKA_ACC_GPU_CUDA_ENABLE : {ON, OFF} -# [ON] ALPAKA_CUDA_VERSION : {9.0, 9.1, 9.2, 10.0, 10.1, 10.2} -# [ON] ALPAKA_CUDA_COMPILER : {nvcc, [CXX==clang++]:clang} -# ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE : {ON, OFF} -# ALPAKA_ACC_GPU_HIP_ENABLE : {ON, OFF} -# [ON] ALPAKA_CI_HIP_BRANCH : {master} -# [ON] ALPAKA_HIP_PLATFORM : {nvcc} -env: - global: - - ALPAKA_CI=TRAVIS - - ALPAKA_CI_DOCKER_IMAGE_NAME=alpaka_ubuntu - - ALPAKA_CI_DOCKER_CACHE_DIR=${HOME}/cache/docker - - ALPAKA_CI_DOCKER_CACHE_IMAGE_FILE_PATH=${ALPAKA_CI_DOCKER_CACHE_DIR}/${ALPAKA_CI_DOCKER_IMAGE_NAME}.tar.gz - - BOOST_ROOT=${HOME}/boost - - ALPAKA_CI_BOOST_LIB_DIR=${HOME}/boost_libs/ - - ALPAKA_CI_CLANG_DIR=${HOME}/llvm - - 
ALPAKA_CI_CMAKE_DIR=${HOME}/CMake - - ALPAKA_CI_CUDA_DIR=${HOME}/CUDA - - ALPAKA_CI_HIP_ROOT_DIR=${HOME}/hip - - TBB_ROOT=${HOME}/tbb - - ALPAKA_CI_SANITIZERS= - - ALPAKA_CI_ANALYSIS=OFF - - ALPAKA_CI_CLANG_LIBSTDCPP_VERSION=5 - - ALPAKA_ACC_GPU_CUDA_ENABLE=OFF - - ALPAKA_ACC_GPU_HIP_ENABLE=OFF - -matrix: - include: - ### Analysis builds - - name: nvcc-9.1 + gcc-5 Debug Analysis - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=5 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_CI_ANALYSIS=ON ALPAKA_DEBUG=2 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=9.1 ALPAKA_CUDA_COMPILER=nvcc - - name: gcc-8 Debug Analysis - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=8 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.66.0 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_CI_ANALYSIS=ON ALPAKA_DEBUG=2 - - name: clang-6 + CUDA-9.0 Debug Analysis - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=6.0.1 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libc++ ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_CI_ANALYSIS=ON ALPAKA_DEBUG=1 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=9.0 ALPAKA_CUDA_COMPILER=clang - - name: clang-8 Debug Analysis - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_CI_ANALYSIS=ON ALPAKA_DEBUG=2 - - name: macOS 10.14 Xcode 11.2 Debug Analysis - os: osx - osx_image: xcode11.2 - env: CXX=g++ CC=gcc CMAKE_BUILD_TYPE=Debug ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_ANALYSIS=ON ALPAKA_DEBUG=2 ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF - - ### macOS - - name: macOS 10.14 Xcode 10.2.1 Debug - os: osx - osx_image: xcode10.2 - env: CXX=g++ CC=gcc CMAKE_BUILD_TYPE=Debug 
ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_CXX_STANDARD=17 - - name: macOS 10.14 Xcode 10.2.1 Release - os: osx - osx_image: xcode10.2 - env: CXX=g++ CC=gcc CMAKE_BUILD_TYPE=Release ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF - - - name: macOS 10.14.4 Xcode 10.3 Debug - os: osx - osx_image: xcode10.3 - env: CXX=g++ CC=gcc CMAKE_BUILD_TYPE=Debug ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF - - name: macOS 10.14.4 Xcode 10.3 Release - os: osx - osx_image: xcode10.3 - env: CXX=g++ CC=gcc CMAKE_BUILD_TYPE=Release ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_CXX_STANDARD=17 - - - name: macOS 10.14 Xcode 11.0 Debug - os: osx - osx_image: xcode11 - env: CXX=g++ CC=gcc CMAKE_BUILD_TYPE=Debug ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF - - name: macOS 10.14 Xcode 11.0 Release - os: osx - osx_image: xcode11 - env: CXX=g++ CC=gcc CMAKE_BUILD_TYPE=Release ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_CXX_STANDARD=17 - - - name: macOS 10.14 Xcode 11.1 Debug - os: osx - osx_image: xcode11.1 - env: CXX=g++ CC=gcc CMAKE_BUILD_TYPE=Debug ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF - - name: macOS 10.14 Xcode 11.1 Release - os: osx - osx_image: xcode11.1 - env: CXX=g++ CC=gcc CMAKE_BUILD_TYPE=Release ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_CXX_STANDARD=17 - - - name: macOS 10.14 Xcode 11.2 Debug - os: osx - osx_image: xcode11.2 - env: CXX=g++ CC=gcc CMAKE_BUILD_TYPE=Debug ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_CXX_STANDARD=17 - - name: macOS 10.14 Xcode 11.2 Release - os: osx - osx_image: xcode11.2 - env: CXX=g++ CC=gcc CMAKE_BUILD_TYPE=Release ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF - - - name: macOS 10.14 Xcode 
11.3 Debug - os: osx - osx_image: xcode11.3 - env: CXX=g++ CC=gcc CMAKE_BUILD_TYPE=Debug ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF - - name: macOS 10.14 Xcode 11.3 Release - os: osx - osx_image: xcode11.3 - env: CXX=g++ CC=gcc CMAKE_BUILD_TYPE=Release ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_CXX_STANDARD=17 - - ### Ubuntu - ## native - # g++ - # We can not enable UBSan when using gcc because it does not have a -fsanitize-blacklist option to suppress errors in boost etc. - # gcc 6 ASan is triggered within libtbb.so - # gcc 7 ASan introduced 'stack-use-after-scope' which is triggered by GOMP_parallel - - name: gcc-5 Release - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=5 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.66.0 ALPAKA_CI_CMAKE_VER=3.15.7 OMP_NUM_THREADS=3 - - name: gcc-6 Debug c++17 - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=6 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.16.4 OMP_NUM_THREADS=2 ALPAKA_CXX_STANDARD=17 - - name: gcc-7 Release - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=7 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.15.7 OMP_NUM_THREADS=1 - - name: gcc-8 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=8 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.16.4 OMP_NUM_THREADS=4 - - name: gcc-9 Debug c++17 - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=9 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.15.7 OMP_NUM_THREADS=3 ALPAKA_CXX_STANDARD=17 ALPAKA_ACC_CPU_BT_OMP4_ENABLE=OFF - - # 
clang++ - - name: clang-4 Debug UBSan - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libc++ ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.16.4 OMP_NUM_THREADS=4 ALPAKA_CI_SANITIZERS=UBSan - - name: clang-5 Debug c++17 - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=5.0.2 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.15.7 OMP_NUM_THREADS=3 ALPAKA_CXX_STANDARD=17 ALPAKA_CI_CLANG_LIBSTDCPP_VERSION=7 - - name: clang-6 Release ASan C++17 - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=6.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.15.7 OMP_NUM_THREADS=2 ALPAKA_CI_SANITIZERS=ASan ALPAKA_CXX_STANDARD=17 - - name: clang-7 Release c++17 - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=7.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.16.4 OMP_NUM_THREADS=1 ALPAKA_CXX_STANDARD=17 ALPAKA_CI_CLANG_LIBSTDCPP_VERSION=7 - - name: clang-8 Release - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++ ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.15.7 OMP_NUM_THREADS=4 - - name: clang-9 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=9.0.0 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libc++ ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.16.4 OMP_NUM_THREADS=1 - - ## CUDA 9.0 - # nvcc + g++ - - name: nvcc-9.0 + gcc-5 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=5 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 
ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=9.0 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="70" - # clang++ - - name: clang-6 + CUDA-9.0 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=6.0.1 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=9.0 ALPAKA_CUDA_COMPILER=clang ALPAKA_CUDA_ARCH="35" - - name: clang-7 + CUDA-9.0 Release - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=7.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++ ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=9.0 ALPAKA_CUDA_COMPILER=clang ALPAKA_CUDA_ARCH="35;70" - - name: clang-8 + CUDA-9.0 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=9.0 ALPAKA_CUDA_COMPILER=clang - - ## CUDA 9.1 - # nvcc + g++ - - name: nvcc-9.1 + gcc-5 Release - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=5 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=9.1 ALPAKA_CUDA_COMPILER=nvcc - # nvcc + clang++ - - name: nvcc-9.1 + clang-4 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=9.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;70" - # clang++ - - name: clang-7 + CUDA-9.1 Release - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 
CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=7.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=9.1 ALPAKA_CUDA_COMPILER=clang ALPAKA_CUDA_ARCH="35;72" - - name: clang-8 + CUDA-9.1 Release - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=9.1 ALPAKA_CUDA_COMPILER=clang - - ## CUDA 9.2 - # nvcc + g++ - - name: nvcc-9.2 + gcc-5 Release - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=5 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=nvcc - - name: nvcc-9.2 + gcc-6 Debug separable compilation - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=6 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION=ON - - name: nvcc-9.2 + gcc-7 Release + extended lambda off - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=7 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;35" ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA=OFF - # nvcc + clang++ - - name: nvcc-9.2 + clang-4 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.15.7 
ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;70" - # clang++ - - name: clang-7 + CUDA-9.2 Release c++17 - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=7.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=clang ALPAKA_CUDA_ARCH="35;72" ALPAKA_CXX_STANDARD=17 - - name: clang-8 + CUDA-9.2 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=clang - - name: clang-9 + CUDA-9.2 Release - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=9.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++ ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=clang - - ## CUDA 10.0 - # nvcc + g++ - - name: nvcc-10.0 + gcc-5 Release - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=5 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=nvcc - - name: nvcc-10.0 + gcc-6 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=6 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=nvcc - - name: nvcc-10.0 + gcc-7 Release - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=7 CMAKE_BUILD_TYPE=Release 
ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;35" - # nvcc + clang++ - - name: nvcc-10.0 + clang-4 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;60" - - name: nvcc-10.0 + clang-5 Debug separable compilation - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=5.0.2 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++ ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="75" ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION=ON - - name: nvcc-10.0 + clang-6 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=6.0.1 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="70" - # clang++ - - name: clang-8 + CUDA-10.0 Release - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=clang - - name: clang-9 + CUDA-10.0 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=9.0.0 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.0 
ALPAKA_CUDA_COMPILER=clang - - ## CUDA 10.1 - # nvcc + g++ - - name: nvcc-10.1 + gcc-5 Release - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=5 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc - - name: nvcc-10.1 + gcc-6 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=6 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc - - name: nvcc-10.1 + gcc-7 Release - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=7 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;35" - - name: nvcc-10.1 + gcc-8 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=8 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;35" - # nvcc + clang++ - - name: nvcc-10.1 + clang-4 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;60" - - name: nvcc-10.1 + clang-5 Release ALPAKA_ACC_GPU_CUDA_ONLY_MODE - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=5.0.2 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++ 
ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="75" ALPAKA_ACC_GPU_CUDA_ONLY_MODE=ON - - name: nvcc-10.1 + clang-6 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=6.0.1 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="70" - - name: nvcc-10.1 + clang-7 Release - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=7.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="70" - - name: nvcc-10.1 + clang-8 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="75" - # clang++ - - name: clang-9 + CUDA-10.1 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=9.0.0 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=clang - - ## CUDA 10.2 - # nvcc + g++ - - name: nvcc-10.2 + gcc-5 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=5 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;35" - - name: nvcc-10.2 + 
gcc-6 Release - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=6 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc - - name: nvcc-10.2 + gcc-7 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=7 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc - - name: nvcc-10.2 + gcc-8 Release - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=8 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;35" - # nvcc + clang++ - - name: nvcc-10.2 + clang-4 Release - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;60" - - name: nvcc-10.2 + clang-5 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=5.0.2 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="75" - - name: nvcc-10.2 + clang-6 Release ALPAKA_ACC_GPU_CUDA_ONLY_MODE - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=6.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.16.4 
ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="70" ALPAKA_ACC_GPU_CUDA_ONLY_MODE=ON - - name: nvcc-10.2 + clang-7 Debug - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=7.0.1 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="75" - - name: nvcc-10.2 + clang-8 Release - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="75" - - ## HIP - - name: HIP(nvcc9.2) + gcc-5 Debug ALPAKA_ACC_GPU_HIP_ONLY_MODE - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=5 CMAKE_BUILD_TYPE=Debug ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.15.7 ALPAKA_ACC_GPU_HIP_ENABLE=ON ALPAKA_ACC_GPU_HIP_ONLY_MODE=ON ALPAKA_CI_HIP_BRANCH="roc-3.1.0" ALPAKA_HIP_PLATFORM=nvcc ALPAKA_CUDA_ARCH="30;35" ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=nvcc - - name: HIP(nvcc9.2) + gcc-5 Release ALPAKA_ACC_GPU_HIP_ONLY_MODE - env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++ CC=gcc ALPAKA_CI_GCC_VER=5 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.16.4 ALPAKA_ACC_GPU_HIP_ENABLE=ON ALPAKA_ACC_GPU_HIP_ONLY_MODE=ON ALPAKA_CI_HIP_BRANCH="roc-3.1.0" ALPAKA_HIP_PLATFORM=nvcc ALPAKA_CUDA_ARCH="30;35" ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=nvcc - -branches: - except: - - gh-pages - -cache: - directories: - - $ALPAKA_CI_DOCKER_CACHE_DIR - -script: - - set -eovx pipefail - - if [ "$TRAVIS_OS_NAME" = "linux" ] ;then sudo apt-get -y --quiet 
--allow-unauthenticated --no-install-recommends install smem ;fi - - if [ "$TRAVIS_OS_NAME" = "linux" ] ;then sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install moreutils ;fi - - if [ "$TRAVIS_OS_NAME" = "osx" ] ;then brew install moreutils ;fi - - if [ "$TRAVIS_OS_NAME" = "linux" ] || [ "$TRAVIS_OS_NAME" = "osx" ] ;then ./script/ci.sh | ts ;fi - -after_failure: - - ./script/after_failure.sh - -notifications: - email: false diff --git a/thirdParty/cupla/alpaka/.zenodo.json b/thirdParty/cupla/alpaka/.zenodo.json index 80d29a6ce4..5958f8b0a4 100644 --- a/thirdParty/cupla/alpaka/.zenodo.json +++ b/thirdParty/cupla/alpaka/.zenodo.json @@ -1,6 +1,6 @@ { - "title": "Alpaka: Abstraction Library for Parallel Kernel Acceleration", - "description": "The alpaka library is a header-only C++11 abstraction library for accelerator development. Its aim is to provide performance portability across accelerators through the abstraction (not hiding!) of the underlying levels of parallelism.", + "title": "alpaka: Abstraction Library for Parallel Kernel Acceleration", + "description": "The alpaka library is a header-only C++14 abstraction library for accelerator development. Its aim is to provide performance portability across accelerators through the abstraction (not hiding!) of the underlying levels of parallelism.", "creators": [ { "affiliation": "LogMeIn, Inc.", diff --git a/thirdParty/cupla/alpaka/CMakeLists.txt b/thirdParty/cupla/alpaka/CMakeLists.txt index 19ee551e9a..985a9e9ded 100644 --- a/thirdParty/cupla/alpaka/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/CMakeLists.txt @@ -31,7 +31,7 @@ set(PACKAGE_VERSION "${ALPAKA_VERSION_MAJOR}.${ALPAKA_VERSION_MINOR}.${ALPAKA_VE project(alpaka VERSION ${ALPAKA_VERSION_MAJOR}.${ALPAKA_VERSION_MINOR}.${ALPAKA_VERSION_PATCH} DESCRIPTION "The alpaka library is a header-only C++14 abstraction library for accelerator development." 
- HOMEPAGE_URL "https://github.com/ComputationalRadiationPhysics/alpaka" + HOMEPAGE_URL "https://github.com/alpaka-group/alpaka" LANGUAGES CXX) set_property(GLOBAL PROPERTY USE_FOLDERS ON) @@ -39,10 +39,10 @@ set_property(GLOBAL PROPERTY USE_FOLDERS ON) ################################################################################ # Options and Variants -option(alpaka_BUILD_EXAMPLES "Build the examples" ON) +option(alpaka_BUILD_EXAMPLES "Build the examples" OFF) +option(BUILD_TESTING "Build the testing tree." OFF) include(CTest) -# automatically defines: BUILD_TESTING, default is ON ################################################################################ # Internal variables. @@ -87,11 +87,11 @@ append_recursive_files_add_to_src_group("${_ALPAKA_ROOT_DIR}/cmake" "${_ALPAKA_R list(APPEND _ALPAKA_FILES_CMAKE "${_ALPAKA_ROOT_DIR}/cmake/alpakaConfig.cmake.in" "${_ALPAKA_ROOT_DIR}/CMakeLists.txt") set_source_files_properties(${_ALPAKA_FILES_CMAKE} PROPERTIES HEADER_FILE_ONLY TRUE) -append_recursive_files_add_to_src_group("${_ALPAKA_ROOT_DIR}/doc/markdown" "${_ALPAKA_ROOT_DIR}" "md" _ALPAKA_FILES_DOC) +append_recursive_files_add_to_src_group("${_ALPAKA_ROOT_DIR}/docs/markdown" "${_ALPAKA_ROOT_DIR}" "md" _ALPAKA_FILES_DOC) set_source_files_properties(${_ALPAKA_FILES_DOC} PROPERTIES HEADER_FILE_ONLY TRUE) append_recursive_files_add_to_src_group("${_ALPAKA_ROOT_DIR}/.github" "${_ALPAKA_ROOT_DIR}" "yml" _ALPAKA_FILES_OTHER) -list(APPEND _ALPAKA_FILES_OTHER "${_ALPAKA_ROOT_DIR}/.gitignore" "${_ALPAKA_ROOT_DIR}/.travis.yml" "${_ALPAKA_ROOT_DIR}/.zenodo.json" "${_ALPAKA_ROOT_DIR}/LICENSE" "${_ALPAKA_ROOT_DIR}/README.md") +list(APPEND _ALPAKA_FILES_OTHER "${_ALPAKA_ROOT_DIR}/.gitignore" "${_ALPAKA_ROOT_DIR}/.zenodo.json" "${_ALPAKA_ROOT_DIR}/LICENSE" "${_ALPAKA_ROOT_DIR}/README.md") set_source_files_properties(${_ALPAKA_FILES_OTHER} PROPERTIES HEADER_FILE_ONLY TRUE) if(TARGET alpaka) @@ -141,38 +141,42 @@ endif() 
################################################################################ # Installation. -include(CMakePackageConfigHelpers) -include(GNUInstallDirs) - -set(_ALPAKA_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/alpaka") - -install(TARGETS alpaka - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) - -write_basic_package_version_file( - "alpakaConfigVersion.cmake" - VERSION ${PROJECT_VERSION} - COMPATIBILITY SameMajorVersion) - -configure_package_config_file( - "${_ALPAKA_ROOT_DIR}/cmake/alpakaConfig.cmake.in" - "${PROJECT_BINARY_DIR}/alpakaConfig.cmake" - INSTALL_DESTINATION "${_ALPAKA_INSTALL_CMAKEDIR}") - -install(FILES "${PROJECT_BINARY_DIR}/alpakaConfig.cmake" - "${PROJECT_BINARY_DIR}/alpakaConfigVersion.cmake" - DESTINATION "${_ALPAKA_INSTALL_CMAKEDIR}") - -install(DIRECTORY "${_ALPAKA_SUFFIXED_INCLUDE_DIR}" - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}") - -install(FILES "${_ALPAKA_ROOT_DIR}/cmake/addExecutable.cmake" - "${_ALPAKA_ROOT_DIR}/cmake/addLibrary.cmake" - "${_ALPAKA_ROOT_DIR}/cmake/alpakaCommon.cmake" - "${_ALPAKA_ROOT_DIR}/cmake/common.cmake" - DESTINATION "${_ALPAKA_INSTALL_CMAKEDIR}") - -install(DIRECTORY "${_ALPAKA_ROOT_DIR}/cmake/modules" - DESTINATION "${_ALPAKA_INSTALL_CMAKEDIR}") + +# Do not install if alpaka is used as a CMake subdirectory +if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME}) + include(CMakePackageConfigHelpers) + include(GNUInstallDirs) + + set(_ALPAKA_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/alpaka") + + install(TARGETS alpaka + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + + write_basic_package_version_file( + "alpakaConfigVersion.cmake" + VERSION ${PROJECT_VERSION} + COMPATIBILITY SameMajorVersion) + + configure_package_config_file( + "${_ALPAKA_ROOT_DIR}/cmake/alpakaConfig.cmake.in" + "${PROJECT_BINARY_DIR}/alpakaConfig.cmake" + 
INSTALL_DESTINATION "${_ALPAKA_INSTALL_CMAKEDIR}") + + install(FILES "${PROJECT_BINARY_DIR}/alpakaConfig.cmake" + "${PROJECT_BINARY_DIR}/alpakaConfigVersion.cmake" + DESTINATION "${_ALPAKA_INSTALL_CMAKEDIR}") + + install(DIRECTORY "${_ALPAKA_SUFFIXED_INCLUDE_DIR}" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}") + + install(FILES "${_ALPAKA_ROOT_DIR}/cmake/addExecutable.cmake" + "${_ALPAKA_ROOT_DIR}/cmake/addLibrary.cmake" + "${_ALPAKA_ROOT_DIR}/cmake/alpakaCommon.cmake" + "${_ALPAKA_ROOT_DIR}/cmake/common.cmake" + DESTINATION "${_ALPAKA_INSTALL_CMAKEDIR}") + + install(DIRECTORY "${_ALPAKA_ROOT_DIR}/cmake/modules" + DESTINATION "${_ALPAKA_INSTALL_CMAKEDIR}") +endif() diff --git a/thirdParty/cupla/alpaka/README.md b/thirdParty/cupla/alpaka/README.md index ac6a256081..cb4c4f64f2 100644 --- a/thirdParty/cupla/alpaka/README.md +++ b/thirdParty/cupla/alpaka/README.md @@ -1,12 +1,15 @@ **alpaka** - Abstraction Library for Parallel Kernel Acceleration ================================================================= -[![Travis CI Build Status](https://travis-ci.org/ComputationalRadiationPhysics/alpaka.svg?branch=develop)](https://travis-ci.org/ComputationalRadiationPhysics/alpaka) + +[![Continuous Integration](https://github.com/alpaka-group/alpaka/workflows/Continuous%20Integration/badge.svg)](https://github.com/alpaka-group/alpaka/actions?query=workflow%3A%22Continuous+Integration%22) +[![Documentation Status](https://readthedocs.org/projects/alpaka/badge/?version=latest)](https://alpaka.readthedocs.io) +[![Doxygen](https://img.shields.io/badge/API-Doxygen-blue.svg)](https://alpaka-group.github.io/alpaka) [![Language](https://img.shields.io/badge/language-C%2B%2B11-orange.svg)](https://isocpp.org/) -[![Platforms](https://img.shields.io/badge/platform-linux%20%7C%20windows%20%7C%20mac-lightgrey.svg)](https://github.com/ComputationalRadiationPhysics/alpaka) 
+[![Platforms](https://img.shields.io/badge/platform-linux%20%7C%20windows%20%7C%20mac-lightgrey.svg)](https://github.com/alpaka-group/alpaka) [![License](https://img.shields.io/badge/license-MPL--2.0-blue.svg)](https://www.mozilla.org/en-US/MPL/2.0/) -![Alpaka](doc/images/alpaka_401x135.png) +![alpaka](docs/logo/alpaka_401x135.png) The **alpaka** library is a header-only C++14 abstraction library for accelerator development. @@ -38,8 +41,9 @@ Software License Documentation ------------- -The [general documentation](doc/markdown/Index.md) is located within the `doc/markdown` subfolder of the repository. -The [source code documentation](http://computationalradiationphysics.github.io/alpaka/) is generated with [doxygen](http://www.doxygen.org). +The alpaka documentation can be found in the [online manual](https://alpaka.readthedocs.io). +The documentation files in [`.rst` (reStructuredText)](https://www.sphinx-doc.org/en/stable/rest.html) format are located in the `docs` subfolder of this repository. +The [source code documentation](https://alpaka-group.github.io/alpaka/) is generated with [doxygen](http://www.doxygen.org). Accelerator Back-ends @@ -55,7 +59,7 @@ Accelerator Back-ends | Boost.Fiber | boost::fibers::fiber |Host CPU (single core)|sequential|parallel (cooperative multitasking)| |TBB|TBB 2.2+|Host CPU (multi core)|parallel (preemptive multitasking)|sequential (only 1 thread per block)| |CUDA|CUDA 9.0-10.2|NVIDIA GPUs|parallel (undefined)|parallel (lock-step within warps)| -|HIP(nvcc)|[HIP 3.1+](https://github.com/ROCm-Developer-Tools/HIP)|NVIDIA GPUs SM 2.0+|parallel (undefined)|parallel (lock-step within warps)| +|HIP(clang)|[HIP 3.5+](https://github.com/ROCm-Developer-Tools/HIP)|AMD GPUs |parallel (undefined)|parallel (lock-step within warps)| Supported Compilers @@ -63,18 +67,18 @@ Supported Compilers This library uses C++14 (or newer when available). -|Accelerator Back-end|gcc 5.5
(Linux)|gcc 6.4/7.3
(Linux)|gcc 8.1/9.1
(Linux)|clang 4
(Linux)|clang 5
(Linux)|clang 6
(Linux)|clang 7
(Linux)|clang 8
(Linux)|clang 9
(Linux)|Apple LLVM 10.2-11.2
(macOS)|MSVC 2017/2019
(Windows)| -|---|---|---|---|---|---|---|---|---|---|---|---| -|Serial|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:| -|OpenMP 2.0+ blocks|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:| -|OpenMP 2.0+ threads|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:| -|OpenMP 4.0+ (CPU)|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|:x:| -| std::thread |:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:| -| Boost.Fiber |:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:| -|TBB|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:| -|CUDA (nvcc)|:white_check_mark:
(CUDA 9.0-10.2)|:white_check_mark:
(CUDA 9.2-10.2) |:x:|:white_check_mark:
(CUDA 9.1-10.2)|:white_check_mark:
(CUDA 10.1-10.2)|:white_check_mark:
(CUDA 10.1-10.2)|:white_check_mark:
(CUDA 10.1-10.2)|:white_check_mark:
(CUDA 10.1-10.2)|:x:|:x:|:white_check_mark:
(CUDA 10.0-10.2)| -|CUDA (clang) | - | - | - | - | - | :white_check_mark:
(CUDA 9.0) | :white_check_mark:
(CUDA 9.0-9.2) | :white_check_mark:
(CUDA 9.0-10.0) | :white_check_mark:
(CUDA 9.2-10.1) | - | - | -|[HIP](doc/markdown/user/implementation/mapping/HIP.md) (nvcc)|:white_check_mark:
(nvcc 9.0+)|:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:| +|Accelerator Back-end|gcc 5.5
(Linux)|gcc 6.4/7.3
(Linux)|gcc 8.1
(Linux)|gcc 9.1
(Linux)|gcc 10.1
(Linux)|clang 4
(Linux)|clang 5
(Linux)|clang 6
(Linux)|clang 7
(Linux)|clang 8
(Linux)|clang 9
(Linux)|clang 10
(Linux)|Apple LLVM 11.0-11.4
(macOS)|MSVC 2017/2019
(Windows)| +|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---| +|Serial|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:| +|OpenMP 2.0+ blocks|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:| +|OpenMP 2.0+ threads|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:| +|OpenMP 4.0+ (CPU)|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|:x:| +| std::thread |:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:| +| Boost.Fiber |:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:| +|TBB|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:| +|CUDA 
(nvcc)|:white_check_mark:
(CUDA 9.0-11.0)|:white_check_mark:
(CUDA 9.2-11.0) |:white_check_mark:
(CUDA 10.1-11.0) |:white_check_mark:
(CUDA 11.0)|:x:|:white_check_mark:
(CUDA 9.1-11.0)|:white_check_mark:
(CUDA 10.1-11.0)|:white_check_mark:
(CUDA 10.1-11.0)|:white_check_mark:
(CUDA 10.1-11.0)|:white_check_mark:
(CUDA 10.1-11.0)|:white_check_mark:
(CUDA 11.0)|:x:|:x:|:white_check_mark:
(CUDA 10.0-10.2)| +|CUDA (clang) | - | - | - | - | - | - | - | :white_check_mark:
(CUDA 9.0) | :white_check_mark:
(CUDA 9.0-9.2) | :white_check_mark:
(CUDA 9.0-10.0) | :white_check_mark:
(CUDA 9.2-10.1) | :white_check_mark:
(CUDA 9.2-10.1) | - | - | +|[HIP](https://alpaka.readthedocs.io/en/latest/install/HIP.html) (clang)|:white_check_mark: |:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:| Other compilers or combinations marked with :x: in the table above may work but are not tested in CI and are therefore not explicitly supported. @@ -86,12 +90,12 @@ Dependencies The **alpaka** library itself just requires header-only libraries. However some of the accelerator back-end implementations require different boost libraries to be built. -When an accelerator back-end using *Boost.Fiber* is enabled, `boost-fiber` and all of its dependencies are required to be built in C++11 mode `./b2 cxxflags="-std=c++11"`. +When an accelerator back-end using *Boost.Fiber* is enabled, `boost-fiber` and all of its dependencies are required to be built in C++14 mode `./b2 cxxflags="-std=c++14"`. When *Boost.Fiber* is enabled and alpaka is built in C++17 mode with clang and libstc++, Boost >= 1.67.0 is required. When an accelerator back-end using *CUDA* is enabled, version *9.0* of the *CUDA SDK* is the minimum requirement. *NOTE*: When using nvcc as *CUDA* compiler, the *CUDA accelerator back-end* can not be enabled together with the *Boost.Fiber accelerator back-end* due to bugs in the nvcc compiler. -*NOTE*: When using clang as a native *CUDA* compiler, the *CUDA accelerator back-end* can not be enabled together with any *OpenMP accelerator back-end* because this combination is currently unsupported. +*NOTE*: When using clang as a native *CUDA* compiler, the *CUDA accelerator back-end* can not be enabled together with the *Boost.Fiber accelerator back-end* or any *OpenMP accelerator back-end* because this combination is currently unsupported. *NOTE*: Separable compilation is only supported when using nvcc, not with clang as native *CUDA* compiler. It is disabled by default and can be enabled via the CMake flag `ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION`. 
When an accelerator back-end using *OpenMP* is enabled, the compiler and the platform have to support the corresponding minimum *OpenMP* version. @@ -111,7 +115,7 @@ Examples of how to utilize alpaka within CMake can be found in the `example` fol The whole alpaka library can be included with: `#include ` Code that is not intended to be utilized by the user is hidden in the `detail` namespace. -Furthermore, for a CUDA-like experience when adopting alpaka we provide the library [*cupla*](https://github.com/ComputationalRadiationPhysics/cupla). +Furthermore, for a CUDA-like experience when adopting alpaka we provide the library [*cupla*](https://github.com/alpaka-group/cupla). It enables a simple and straightforward way of porting existing CUDA applications to alpaka and thus to a variety of accelerators. Introduction diff --git a/thirdParty/cupla/alpaka/cmake/addExecutable.cmake b/thirdParty/cupla/alpaka/cmake/addExecutable.cmake index aa33f63757..3cb8dd651d 100644 --- a/thirdParty/cupla/alpaka/cmake/addExecutable.cmake +++ b/thirdParty/cupla/alpaka/cmake/addExecutable.cmake @@ -1,7 +1,7 @@ # # Copyright 2014-2019 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/cmake/addLibrary.cmake b/thirdParty/cupla/alpaka/cmake/addLibrary.cmake index e2f09641c8..8ee02ceb47 100644 --- a/thirdParty/cupla/alpaka/cmake/addLibrary.cmake +++ b/thirdParty/cupla/alpaka/cmake/addLibrary.cmake @@ -1,7 +1,7 @@ # # Copyright 2015-2019 Benjamin Worpitz, Maximilian Knespel # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/cmake/alpakaCommon.cmake b/thirdParty/cupla/alpaka/cmake/alpakaCommon.cmake index cc1e21e714..db2c533617 100644 --- a/thirdParty/cupla/alpaka/cmake/alpakaCommon.cmake +++ b/thirdParty/cupla/alpaka/cmake/alpakaCommon.cmake @@ -1,7 +1,8 @@ # # Copyright 2014-2020 Benjamin Worpitz, Erik Zenker, Axel Huebl, Jan Stephan +# Rene Widera # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this @@ -12,80 +13,50 @@ include(CMakePrintHelpers) # for easier printing of variables and properties #------------------------------------------------------------------------------- # Options. -set(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE_DEFAULT ON) -set(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE_DEFAULT ON) -set(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE_DEFAULT ON) -set(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE_DEFAULT ON) -set(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE_DEFAULT ON) -set(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE_DEFAULT ON) -set(ALPAKA_ACC_CPU_BT_OMP4_ENABLE_DEFAULT ON) +set(ALPAKA_EMU_MEMCPY3D_DEFAULT OFF) # HIP and platform selection and warning about unsupported features option(ALPAKA_ACC_GPU_HIP_ENABLE "Enable the HIP back-end (all other back-ends must be disabled)" OFF) option(ALPAKA_ACC_GPU_HIP_ONLY_MODE "Only back-ends using HIP can be enabled in this mode." OFF) # HIP only runs without other back-ends # Drop-down combo box in cmake-gui for HIP platforms. 
-set(ALPAKA_HIP_PLATFORM "nvcc" CACHE STRING "Specify HIP platform") +set(ALPAKA_HIP_PLATFORM "clang" CACHE STRING "Specify HIP platform") set_property(CACHE ALPAKA_HIP_PLATFORM PROPERTY STRINGS "nvcc;clang") if(ALPAKA_ACC_GPU_HIP_ENABLE AND NOT ALPAKA_ACC_GPU_HIP_ONLY_MODE AND ALPAKA_HIP_PLATFORM MATCHES "nvcc") - message(WARNING "HIP back-end must be used together with ALPAKA_ACC_GPU_HIP_ONLY_MODE") - set(ALPAKA_ACC_GPU_HIP_ENABLE OFF CACHE BOOL "" FORCE) + message(FATAL_ERROR "HIP back-end must be used together with ALPAKA_ACC_GPU_HIP_ONLY_MODE") endif() if(ALPAKA_ACC_GPU_HIP_ENABLE AND ALPAKA_HIP_PLATFORM MATCHES "clang") message(WARNING "The HIP back-end is currently experimental." - "Alpaka HIP backend compiled with clang does not support callback functions." + "alpaka HIP backend compiled with clang does not support callback functions." ) endif() +option(ALPAKA_ACC_GPU_CUDA_ENABLE "Enable the CUDA GPU back-end" OFF) option(ALPAKA_ACC_GPU_CUDA_ONLY_MODE "Only back-ends using CUDA can be enabled in this mode (This allows to mix alpaka code with native CUDA code)." OFF) -# If CUDA-only mode is enabled, we set the defaults for all CPU back-ends to OFF. -# If they are explicitly set via the command line, the user will get an error later on. -if(ALPAKA_ACC_GPU_CUDA_ONLY_MODE OR ALPAKA_ACC_GPU_HIP_ONLY_MODE) # CUDA-only or HIP-only - set(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE_DEFAULT OFF) - set(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE_DEFAULT OFF) - set(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE_DEFAULT OFF) - set(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE_DEFAULT OFF) - set(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE_DEFAULT OFF) - set(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE_DEFAULT OFF) - set(ALPAKA_ACC_CPU_BT_OMP4_ENABLE_DEFAULT OFF) -endif() - -option(ALPAKA_ACC_GPU_CUDA_ENABLE "Enable the CUDA GPU back-end" ON) - -# If CUDA is enabled, we set the defaults for some unsupported back-ends to OFF. -# If they are explicitly set via the command line, the user will get an error later on. 
-if(ALPAKA_ACC_GPU_CUDA_ENABLE) - set(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE_DEFAULT OFF) - if(ALPAKA_CUDA_COMPILER MATCHES "clang") - set(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE_DEFAULT OFF) - set(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE_DEFAULT OFF) - set(ALPAKA_ACC_CPU_BT_OMP4_ENABLE_DEFAULT OFF) - endif() -endif() if(ALPAKA_ACC_GPU_HIP_ENABLE) - set(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE_DEFAULT OFF) + set(ALPAKA_EMU_MEMCPY3D_DEFAULT ON) endif() if(ALPAKA_ACC_GPU_CUDA_ONLY_MODE AND NOT ALPAKA_ACC_GPU_CUDA_ENABLE) - message(WARNING "If ALPAKA_ACC_GPU_CUDA_ONLY_MODE is enabled, ALPAKA_ACC_GPU_CUDA_ENABLE has to be enabled as well.") - set(_ALPAKA_FOUND FALSE) + message(FATAL_ERROR "If ALPAKA_ACC_GPU_CUDA_ONLY_MODE is enabled, ALPAKA_ACC_GPU_CUDA_ENABLE has to be enabled as well.") endif() if(ALPAKA_ACC_GPU_HIP_ONLY_MODE AND NOT ALPAKA_ACC_GPU_HIP_ENABLE) - message(WARNING "If ALPAKA_ACC_GPU_HIP_ONLY_MODE is enabled, ALPAKA_ACC_GPU_HIP_ENABLE has to be enabled as well.") - set(_ALPAKA_FOUND FALSE) + message(FATAL_ERROR "If ALPAKA_ACC_GPU_HIP_ONLY_MODE is enabled, ALPAKA_ACC_GPU_HIP_ENABLE has to be enabled as well.") endif() -option(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE "Enable the serial CPU back-end" ${ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE_DEFAULT}) -option(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE "Enable the threads CPU block thread back-end" ${ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE_DEFAULT}) -option(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE "Enable the fibers CPU block thread back-end" ${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE_DEFAULT}) -option(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE "Enable the TBB CPU grid block back-end" ${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE_DEFAULT}) -option(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE "Enable the OpenMP 2.0 CPU grid block back-end" ${ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE_DEFAULT}) -option(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE "Enable the OpenMP 2.0 CPU block thread back-end" ${ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE_DEFAULT}) -option(ALPAKA_ACC_CPU_BT_OMP4_ENABLE "Enable the OpenMP 
4.0 CPU block and block thread back-end" ${ALPAKA_ACC_CPU_BT_OMP4_ENABLE_DEFAULT}) +option(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE "Enable the serial CPU back-end" OFF) +option(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE "Enable the threads CPU block thread back-end" OFF) +option(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE "Enable the fibers CPU block thread back-end" OFF) +option(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE "Enable the TBB CPU grid block back-end" OFF) +option(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE "Enable the OpenMP 2.0 CPU grid block back-end" OFF) +option(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE "Enable the OpenMP 2.0 CPU block thread back-end" OFF) +option(ALPAKA_ACC_CPU_BT_OMP4_ENABLE "Enable the OpenMP 4.0 CPU block and block thread back-end" OFF) + +option(ALPAKA_EMU_MEMCPY3D "Emulate internal used hip/cuda-Memcpy3D(async) with a kernel" ${ALPAKA_EMU_MEMCPY3D_DEFAULT}) if((ALPAKA_ACC_GPU_CUDA_ONLY_MODE OR ALPAKA_ACC_GPU_HIP_ONLY_MODE) AND @@ -97,12 +68,11 @@ if((ALPAKA_ACC_GPU_CUDA_ONLY_MODE OR ALPAKA_ACC_GPU_HIP_ONLY_MODE) ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE OR ALPAKA_ACC_CPU_BT_OMP4_ENABLE)) if(ALPAKA_ACC_GPU_CUDA_ONLY_MODE) - message(WARNING "If ALPAKA_ACC_GPU_CUDA_ONLY_MODE is enabled, only back-ends using CUDA can be enabled! This allows to mix alpaka code with native CUDA code. However, this prevents any non-CUDA back-ends from being enabled.") + message(FATAL_ERROR "If ALPAKA_ACC_GPU_CUDA_ONLY_MODE is enabled, only back-ends using CUDA can be enabled! This allows to mix alpaka code with native CUDA code. 
However, this prevents any non-CUDA back-ends from being enabled.") endif() if(ALPAKA_ACC_GPU_HIP_ONLY_MODE) - message(WARNING "If ALPAKA_ACC_GPU_HIP_ONLY_MODE is enabled, only back-ends using HIP can be enabled!") + message(FATAL_ERROR "If ALPAKA_ACC_GPU_HIP_ONLY_MODE is enabled, only back-ends using HIP can be enabled!") endif() - set(_ALPAKA_FOUND FALSE) endif() # avoids CUDA+HIP conflict @@ -112,8 +82,7 @@ endif() # HIP is only supported on Linux if(ALPAKA_ACC_GPU_HIP_ENABLE AND (MSVC OR WIN32)) - message(WARNING "Optional alpaka dependency HIP can not be built on Windows! HIP back-end disabled!") - set(ALPAKA_ACC_GPU_HIP_ENABLE OFF CACHE BOOL "Enable the HIP GPU back-end" FORCE) + message(FATAL_ERROR "Optional alpaka dependency HIP can not be built on Windows!") endif() # Drop-down combo box in cmake-gui. @@ -125,12 +94,15 @@ set_property(CACHE ALPAKA_CXX_STANDARD PROPERTY STRINGS "14;17;20") if(NOT TARGET alpaka) add_library(alpaka INTERFACE) - + target_compile_features(alpaka INTERFACE cxx_std_${ALPAKA_CXX_STANDARD}) add_library(alpaka::alpaka ALIAS alpaka) endif() +option(ALPAKA_DEBUG_OFFLOAD_ASSUME_HOST "Allow host-only contructs like assert in offload code in debug mode." ON) +set(ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB "30" CACHE STRING "Kibibytes (1024B) of memory to allocate for block shared memory for backends requiring static allocation (includes CPU_B_OMP2_T_SEQ, CPU_B_TBB_T_SEQ, CPU_B_SEQ_T_SEQ)") + #------------------------------------------------------------------------------- # Debug output of common variables. if(${ALPAKA_DEBUG} GREATER 1) @@ -186,16 +158,10 @@ find_package(Boost ${_ALPAKA_BOOST_MIN_VER} REQUIRED target_link_libraries(alpaka INTERFACE Boost::headers) -if(Boost_FIBER_FOUND) - if(MSVC AND (${CMAKE_SIZEOF_VOID_P} EQUAL 4)) - # On Win32 boost context triggers: - # libboost_context-vc141-mt-gd-1_64.lib(jump_i386_ms_pe_masm.obj) : error LNK2026: module unsafe for SAFESEH image. 
- target_link_options(Boost::fiber INTERFACE "/SAFESEH:NO") +if(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE) + if(NOT Boost_FIBER_FOUND) + message(FATAL_ERROR "Optional alpaka dependency Boost.Fiber could not be found!") endif() - target_link_libraries(alpaka INTERFACE Boost::fiber) -else() - message(STATUS "Optional alpaka dependency Boost.Fiber could not be found! Fiber back-end disabled!") - set(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE OFF CACHE BOOL "Enable the Boost.Fiber CPU back-end" FORCE) endif() if(${ALPAKA_DEBUG} GREATER 1) @@ -257,8 +223,7 @@ if(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE) if(TBB_FOUND) target_link_libraries(alpaka INTERFACE TBB::tbb) else() - message(STATUS "Optional alpaka dependency TBB could not be found! TBB grid block back-end disabled!") - set(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE OFF CACHE BOOL "Enable the TBB grid block back-end" FORCE) + message(FATAL_ERROR "Optional alpaka dependency TBB could not be found!") endif() endif() @@ -268,21 +233,20 @@ if(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE OR ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE OR A find_package(OpenMP) if(OpenMP_CXX_FOUND) - if(OpenMP_CXX_VERSION VERSION_LESS 4.0) - set(ALPAKA_ACC_CPU_BT_OMP4_ENABLE OFF CACHE BOOL "Enable the OpenMP 4.0 CPU block and thread back-end" FORCE) + if(ALPAKA_ACC_CPU_BT_OMP4_ENABLE) + if(OpenMP_CXX_VERSION VERSION_LESS 4.0) + message(FATAL_ERROR "OpenMP 4.0 is required but not supported!") + endif() endif() target_link_libraries(alpaka INTERFACE OpenMP::OpenMP_CXX) # Clang versions starting from 3.9 support OpenMP 4.0 only when given the corresponding flag if(ALPAKA_ACC_CPU_BT_OMP4_ENABLE) - target_link_options(alpaka INTERFACE $<$:"-fopenmp-version=40">) + target_link_options(alpaka INTERFACE $<$:-fopenmp-version=40>) endif() else() - message(STATUS "Optional alpaka dependency OpenMP could not be found! 
OpenMP back-ends disabled!") - set(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE OFF CACHE BOOL "Enable the OpenMP 2.0 CPU grid block back-end" FORCE) - set(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE OFF CACHE BOOL "Enable the OpenMP 2.0 CPU block thread back-end" FORCE) - set(ALPAKA_ACC_CPU_BT_OMP4_ENABLE OFF CACHE BOOL "Enable the OpenMP 4.0 CPU block and thread back-end" FORCE) + message(FATAL_ERROR "Optional alpaka dependency OpenMP could not be found!") endif() endif() @@ -295,14 +259,12 @@ if(ALPAKA_ACC_GPU_CUDA_ENABLE) endif() if(ALPAKA_CUDA_VERSION VERSION_LESS 9.0) - message(WARNING "CUDA Toolkit < 9.0 is not supported!") - set(_ALPAKA_FOUND FALSE) + message(FATAL_ERROR "CUDA Toolkit < 9.0 is not supported!") else() find_package(CUDA "${ALPAKA_CUDA_VERSION}") if(NOT CUDA_FOUND) - message(STATUS "Optional alpaka dependency CUDA could not be found! CUDA back-end disabled!") - set(ALPAKA_ACC_GPU_CUDA_ENABLE OFF CACHE BOOL "Enable the CUDA GPU back-end" FORCE) + message(FATAL_ERROR "Optional alpaka dependency CUDA could not be found!") else() set(ALPAKA_CUDA_VERSION "${CUDA_VERSION}") if(CUDA_VERSION VERSION_LESS 10.3) @@ -488,8 +450,8 @@ if(ALPAKA_ACC_GPU_CUDA_ENABLE) foreach(_CUDA_ARCH_ELEM ${ALPAKA_CUDA_ARCH}) # set flags to create device code for the given architecture list(APPEND CUDA_NVCC_FLAGS - --generate-code arch=compute_${_CUDA_ARCH_ELEM},code=sm_${_CUDA_ARCH_ELEM} - --generate-code arch=compute_${_CUDA_ARCH_ELEM},code=compute_${_CUDA_ARCH_ELEM} + --generate-code=arch=compute_${_CUDA_ARCH_ELEM},code=sm_${_CUDA_ARCH_ELEM} + --generate-code=arch=compute_${_CUDA_ARCH_ELEM},code=compute_${_CUDA_ARCH_ELEM} ) endforeach() @@ -524,30 +486,31 @@ if(ALPAKA_ACC_GPU_CUDA_ENABLE) endif() # Always add warning/error numbers which can be used for suppressions - list(APPEND CUDA_NVCC_FLAGS -Xcudafe --display_error_number) + list(APPEND CUDA_NVCC_FLAGS -Xcudafe=--display_error_number) # avoids warnings on host-device signatured, default constructors/destructors - list(APPEND 
CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored) + list(APPEND CUDA_NVCC_FLAGS -Xcudafe=--diag_suppress=esa_on_defaulted_function_ignored) # avoids warnings on host-device signature of 'std::__shared_count<>' if(CUDA_VERSION EQUAL 10.0) - list(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=2905) + list(APPEND CUDA_NVCC_FLAGS -Xcudafe=--diag_suppress=2905) elseif(CUDA_VERSION EQUAL 10.1) - list(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=2912) + list(APPEND CUDA_NVCC_FLAGS -Xcudafe=--diag_suppress=2912) elseif(CUDA_VERSION EQUAL 10.2) - list(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=2976) + list(APPEND CUDA_NVCC_FLAGS -Xcudafe=--diag_suppress=2976) endif() if(ALPAKA_CUDA_KEEP_FILES) file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/nvcc_tmp") - list(APPEND CUDA_NVCC_FLAGS --keep --keep-dir ${PROJECT_BINARY_DIR}/nvcc_tmp) + list(APPEND CUDA_NVCC_FLAGS --keep) + list(APPEND CUDA_NVCC_FLAGS --keep-dir="${PROJECT_BINARY_DIR}/nvcc_tmp") endif() option(ALPAKA_CUDA_SHOW_CODELINES "Show kernel lines in cuda-gdb and cuda-memcheck" OFF) if(ALPAKA_CUDA_SHOW_CODELINES) list(APPEND CUDA_NVCC_FLAGS --source-in-ptx -lineinfo) if(NOT MSVC) - list(APPEND CUDA_NVCC_FLAGS -Xcompiler -rdynamic) + list(APPEND CUDA_NVCC_FLAGS -Xcompiler=-rdynamic) endif() set(ALPAKA_CUDA_KEEP_FILES ON CACHE BOOL "activate keep files" FORCE) endif() @@ -570,12 +533,11 @@ endif() if(ALPAKA_ACC_GPU_HIP_ENABLE) if(NOT DEFINED ALPAKA_HIP_VERSION) - set(ALPAKA_HIP_VERSION 3.1) + set(ALPAKA_HIP_VERSION 3.5) endif() - if(ALPAKA_HIP_VERSION VERSION_LESS 3.1) - message(WARNING "HIP < 3.1 is not supported!") - set(_ALPAKA_FOUND FALSE) + if(ALPAKA_HIP_VERSION VERSION_LESS 3.5) + message(FATAL_ERROR "HIP < 3.5 is not supported!") else() # must set this for HIP package (note that you also need certain env vars) set(HIP_PLATFORM "${ALPAKA_HIP_PLATFORM}" CACHE STRING "") @@ -583,8 +545,7 @@ if(ALPAKA_ACC_GPU_HIP_ENABLE) find_package(HIP "${ALPAKA_HIP_VERSION}") if(NOT HIP_FOUND) - 
message(WARNING "Optional alpaka dependency HIP could not be found! HIP back-end disabled!") - set(ALPAKA_ACC_GPU_HIP_ENABLE OFF CACHE BOOL "Enable the HIP GPU back-end" FORCE) + message(FATAL_ERROR "Optional alpaka dependency HIP could not be found!") else() set(ALPAKA_HIP_VERSION "${HIP_VERSION}") set(ALPAKA_HIP_COMPILER "hipcc" CACHE STRING "HIP compiler") @@ -603,11 +564,14 @@ if(ALPAKA_ACC_GPU_HIP_ENABLE) message(WARNING "Could not find CUDA while HIP platform is set to nvcc. Compilation might fail.") endif() - set(ALPAKA_CUDA_ARCH "30" CACHE STRING "GPU architecture") + if(CUDA_VERSION VERSION_LESS 10.3) + set(ALPAKA_HIP_ARCH "30" CACHE STRING "GPU architecture") + else() + set(ALPAKA_HIP_ARCH "35" CACHE STRING "GPU architecture") + endif() if(CUDA_VERSION VERSION_LESS 9.0) - message(WARNING "CUDA Toolkit < 9.0 is not supported!") - set(_ALPAKA_FOUND FALSE) + message(FATAL_ERROR "CUDA Toolkit < 9.0 is not supported!") endif() if(${ALPAKA_DEBUG} GREATER 1) @@ -618,18 +582,18 @@ if(ALPAKA_ACC_GPU_HIP_ENABLE) list(APPEND HIP_NVCC_FLAGS --expt-relaxed-constexpr) list(APPEND _ALPAKA_HIP_LIBRARIES "cudart") - foreach(_HIP_ARCH_ELEM ${ALPAKA_CUDA_ARCH}) + foreach(_HIP_ARCH_ELEM ${ALPAKA_HIP_ARCH}) # set flags to create device code for the given architecture list(APPEND CUDA_NVCC_FLAGS - --generate-code arch=compute_${_HIP_ARCH_ELEM},code=sm_${_HIP_ARCH_ELEM} - --generate-code arch=compute_${_HIP_ARCH_ELEM},code=compute_${_HIP_ARCH_ELEM} + --generate-code=arch=compute_${_HIP_ARCH_ELEM},code=sm_${_HIP_ARCH_ELEM} + --generate-code=arch=compute_${_HIP_ARCH_ELEM},code=compute_${_HIP_ARCH_ELEM} ) endforeach() # for CUDA cmake automatically adds compiler flags as nvcc does not do this, # but for HIP we have to do this here list(APPEND HIP_NVCC_FLAGS -D__CUDACC__) list(APPEND HIP_NVCC_FLAGS -ccbin ${CMAKE_CXX_COMPILER}) - list(APPEND HIP_NVCC_FLAGS -Xcompiler -g) + list(APPEND HIP_NVCC_FLAGS -Xcompiler=-g) if((CMAKE_BUILD_TYPE STREQUAL "Debug") OR (CMAKE_BUILD_TYPE STREQUAL 
"RelWithDebInfo")) list(APPEND HIP_NVCC_FLAGS -G) @@ -638,7 +602,7 @@ if(ALPAKA_ACC_GPU_HIP_ENABLE) # SET(CUDA_PROPAGATE_HOST_FLAGS ON) # does not exist in HIP, so do it manually string(TOUPPER "${CMAKE_BUILD_TYPE}" build_config) foreach( _flag ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${build_config}}) - list(APPEND HIP_NVCC_FLAGS -Xcompiler ${_flag}) + list(APPEND HIP_NVCC_FLAGS -Xcompiler=${_flag}) endforeach() if(ALPAKA_HIP_FAST_MATH) @@ -656,7 +620,7 @@ if(ALPAKA_ACC_GPU_HIP_ENABLE) endif() # avoids warnings on host-device signatured, default constructors/destructors - list(APPEND HIP_HIPCC_FLAGS -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored) + list(APPEND HIP_HIPCC_FLAGS -Xcudafe=--diag_suppress=esa_on_defaulted_function_ignored) # random numbers library ( HIP(NVCC) ) /hiprand # HIP_ROOT_DIR is set by FindHIP.cmake @@ -679,24 +643,59 @@ if(ALPAKA_ACC_GPU_HIP_ENABLE) endif() target_include_directories(alpaka INTERFACE ${HIP_RAND_INC}) target_link_libraries(alpaka INTERFACE ${HIP_RAND_LIBRARY}) - endif() # nvcc + elseif(ALPAKA_HIP_PLATFORM MATCHES "clang") + # # hiprand requires ROCm implementation of random numbers by rocrand + find_package(rocrand REQUIRED CONFIG + HINTS "${HIP_ROOT_DIR}/rocrand" + HINTS "/opt/rocm/rocrand") + if(rocrand_FOUND) + target_include_directories(alpaka INTERFACE ${rocrand_INCLUDE_DIRS}) + # ATTENTION: rocRand libraries are not required by alpaka + else() + MESSAGE(FATAL_ERROR "Could not find rocRAND (also searched in: HIP_ROOT_DIR=${HIP_ROOT_DIR}/rocrand).") + endif() - list(APPEND HIP_HIPCC_FLAGS "-D__HIPCC__") - list(APPEND HIP_HIPCC_FLAGS "-std=c++${ALPAKA_CXX_STANDARD}") + # possible architectures can be found https://github.com/llvm/llvm-project/blob/master/clang/lib/Basic/Cuda.cpp#L65 + # 900 -> AMD Vega64 + # 902 -> AMD Vega 10 + # 906 -> AMD Radeon VII, MI50/MI60 + # 908 -> AMD MI100 + set(ALPAKA_HIP_ARCH "906;908" CACHE STRING "AMD GPU architecture e.g. 
906 for MI50/Radeon VII") + + foreach(_HIP_ARCH_ELEM ${ALPAKA_HIP_ARCH}) + # set flags to create device code for the given architecture + list(APPEND HIP_HIPCC_FLAGS --amdgpu-target=gfx${_HIP_ARCH_ELEM}) + endforeach() + endif() + + # # HIP random numbers + FIND_PACKAGE(hiprand REQUIRED CONFIG + HINTS "${HIP_ROOT_DIR}/hiprand" + HINTS "/opt/rocm/hiprand") + if(hiprand_FOUND) + target_include_directories(alpaka INTERFACE ${hiprand_INCLUDE_DIRS}) + # ATTENTION: hipRand libraries are not required by alpaka + else() + MESSAGE(FATAL_ERROR "Could not find hipRAND (also searched in: HIP_ROOT_DIR=${HIP_ROOT_DIR}/hiprand).") + endif() + + list(APPEND HIP_HIPCC_FLAGS -D__HIPCC__) + list(APPEND HIP_HIPCC_FLAGS -std=c++${ALPAKA_CXX_STANDARD}) if((CMAKE_BUILD_TYPE STREQUAL "Debug") OR (CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")) - list(APPEND HIP_HIPCC_FLAGS "-g") + list(APPEND HIP_HIPCC_FLAGS -g) endif() if(ALPAKA_HIP_KEEP_FILES) file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/hip_tmp") - list(APPEND HIP_HIPCC_FLAGS "--keep" "--keep-dir" "${PROJECT_BINARY_DIR}/hip_tmp") + list(APPEND HIP_HIPCC_FLAGS --keep) + list(APPEND HIP_HIPCC_FLAGS --keep-dir "${PROJECT_BINARY_DIR}/hip_tmp") endif() option(ALPAKA_HIP_SHOW_CODELINES "Show kernel lines in cuda-gdb and cuda-memcheck" OFF) if(ALPAKA_HIP_SHOW_CODELINES) - list(APPEND HIP_HIPCC_FLAGS "--source-in-ptx" "-lineinfo") - list(APPEND HIP_HIPCC_FLAGS "-Xcompiler" "-rdynamic") + list(APPEND HIP_HIPCC_FLAGS --source-in-ptx -lineinfo) + list(APPEND HIP_HIPCC_FLAGS -Xcompiler=rdynamic) set(ALPAKA_HIP_KEEP_FILES ON CACHE BOOL "activate keep files" FORCE) endif() if(_ALPAKA_HIP_LIBRARIES) @@ -729,6 +728,14 @@ if(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE) endif() if(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE) target_compile_definitions(alpaka INTERFACE "ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED") + + if(MSVC AND (${CMAKE_SIZEOF_VOID_P} EQUAL 4)) + # On Win32 boost context triggers: + # libboost_context-vc141-mt-gd-1_64.lib(jump_i386_ms_pe_masm.obj) : error 
LNK2026: module unsafe for SAFESEH image. + target_link_options(Boost::fiber INTERFACE "/SAFESEH:NO") + endif() + target_link_libraries(alpaka INTERFACE Boost::fiber) + message(STATUS ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED) endif() if(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE) @@ -756,7 +763,15 @@ if(ALPAKA_ACC_GPU_HIP_ENABLE) message(STATUS ALPAKA_ACC_GPU_HIP_ENABLED) endif() +if(ALPAKA_EMU_MEMCPY3D) + target_compile_definitions(alpaka INTERFACE "ALPAKA_EMU_MEMCPY3D_ENABLED") +endif() + target_compile_definitions(alpaka INTERFACE "ALPAKA_DEBUG=${ALPAKA_DEBUG}") +if(ALPAKA_DEBUG_OFFLOAD_ASSUME_HOST) + target_compile_definitions(alpaka INTERFACE "ALPAKA_DEBUG_OFFLOAD_ASSUME_HOST") +endif() +target_compile_definitions(alpaka INTERFACE "ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB=${ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB}") if(ALPAKA_CI) target_compile_definitions(alpaka INTERFACE "ALPAKA_CI") @@ -789,12 +804,6 @@ if(ALPAKA_ACC_GPU_HIP_ENABLE) set_property(TARGET alpaka PROPERTY INTERFACE_LINK_LIBRARIES ${_ALPAKA_LINK_LIBRARIES_PUBLIC}) endif() - if(ALPAKA_HIP_PLATFORM MATCHES "clang") - # GFX600, GFX601, GFX700, GFX701, GFX702, GFX703, GFX704, GFX801, GFX802, GFX803, GFX810, GFX900, GFX902 - target_link_options(alpaka INTERFACE "--amdgpu-target=gfx803") - target_link_options(alpaka INTERFACE "--amdgpu-target=gfx900") - target_link_options(alpaka INTERFACE "--amdgpu-target=gfx906") - endif() endif() #------------------------------------------------------------------------------- @@ -847,5 +856,10 @@ if((ALPAKA_ACC_GPU_CUDA_ENABLE OR ALPAKA_ACC_GPU_HIP_ENABLE) AND ALPAKA_CUDA_COM PROPERTY INTERFACE_COMPILE_OPTIONS) string(REPLACE ";" " " _ALPAKA_COMPILE_OPTIONS_STRING "${_ALPAKA_COMPILE_OPTIONS_PUBLIC}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${_ALPAKA_COMPILE_OPTIONS_STRING}") + + # Append CMAKE_CXX_FLAGS_[Release|Debug|RelWithDebInfo] to CMAKE_CXX_FLAGS + # because FindCUDA only propagates the latter to nvcc. 
+ string(TOUPPER "${CMAKE_BUILD_TYPE}" build_config) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${build_config}}") endif() diff --git a/thirdParty/cupla/alpaka/cmake/alpakaConfig.cmake.in b/thirdParty/cupla/alpaka/cmake/alpakaConfig.cmake.in index c82274ffbd..a8a4687db5 100644 --- a/thirdParty/cupla/alpaka/cmake/alpakaConfig.cmake.in +++ b/thirdParty/cupla/alpaka/cmake/alpakaConfig.cmake.in @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Benjamin Worpitz, Erik Zenker, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/cmake/common.cmake b/thirdParty/cupla/alpaka/cmake/common.cmake index 74ed5c1924..6041cbf187 100644 --- a/thirdParty/cupla/alpaka/cmake/common.cmake +++ b/thirdParty/cupla/alpaka/cmake/common.cmake @@ -1,7 +1,7 @@ # # Copyright 2014-2019 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/cmake/modules/FindHIP.cmake b/thirdParty/cupla/alpaka/cmake/modules/FindHIP.cmake index dd55e18228..cc7f4af20c 100644 --- a/thirdParty/cupla/alpaka/cmake/modules/FindHIP.cmake +++ b/thirdParty/cupla/alpaka/cmake/modules/FindHIP.cmake @@ -1,74 +1,36 @@ -# /* -# Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
- -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. 
-# */ - ############################################################################### # FindHIP.cmake ############################################################################### - +include(CheckCXXCompilerFlag) ############################################################################### # SET: Variable defaults ############################################################################### # User defined flags set(HIP_HIPCC_FLAGS "" CACHE STRING "Semicolon delimited flags for HIPCC") set(HIP_HCC_FLAGS "" CACHE STRING "Semicolon delimited flags for HCC") +set(HIP_CLANG_FLAGS "" CACHE STRING "Semicolon delimited flags for CLANG") set(HIP_NVCC_FLAGS "" CACHE STRING "Semicolon delimted flags for NVCC") -mark_as_advanced(HIP_HIPCC_FLAGS HIP_HCC_FLAGS HIP_NVCC_FLAGS) +mark_as_advanced(HIP_HIPCC_FLAGS HIP_HCC_FLAGS HIP_CLANG_FLAGS HIP_NVCC_FLAGS) set(_hip_configuration_types ${CMAKE_CONFIGURATION_TYPES} ${CMAKE_BUILD_TYPE} Debug MinSizeRel Release RelWithDebInfo) list(REMOVE_DUPLICATES _hip_configuration_types) foreach(config ${_hip_configuration_types}) string(TOUPPER ${config} config_upper) set(HIP_HIPCC_FLAGS_${config_upper} "" CACHE STRING "Semicolon delimited flags for HIPCC") set(HIP_HCC_FLAGS_${config_upper} "" CACHE STRING "Semicolon delimited flags for HCC") + set(HIP_CLANG_FLAGS_${config_upper} "" CACHE STRING "Semicolon delimited flags for CLANG") set(HIP_NVCC_FLAGS_${config_upper} "" CACHE STRING "Semicolon delimited flags for NVCC") - mark_as_advanced(HIP_HIPCC_FLAGS_${config_upper} HIP_HCC_FLAGS_${config_upper} HIP_NVCC_FLAGS_${config_upper}) + mark_as_advanced(HIP_HIPCC_FLAGS_${config_upper} HIP_HCC_FLAGS_${config_upper} HIP_CLANG_FLAGS_${config_upper} HIP_NVCC_FLAGS_${config_upper}) endforeach() option(HIP_HOST_COMPILATION_CPP "Host code compilation mode" ON) option(HIP_VERBOSE_BUILD "Print out the commands run while compiling the HIP source file. 
With the Makefile generator this defaults to VERBOSE variable specified on the command line, but can be forced on with this option." OFF) mark_as_advanced(HIP_HOST_COMPILATION_CPP) ############################################################################### -# Set HIP CMAKE Flags +# FIND: HIP and associated helper binaries ############################################################################### -# Copy the invocation styles from CXX to HIP -set(CMAKE_HIP_ARCHIVE_CREATE ${CMAKE_CXX_ARCHIVE_CREATE}) -set(CMAKE_HIP_ARCHIVE_APPEND ${CMAKE_CXX_ARCHIVE_APPEND}) -set(CMAKE_HIP_ARCHIVE_FINISH ${CMAKE_CXX_ARCHIVE_FINISH}) -set(CMAKE_SHARED_LIBRARY_SONAME_HIP_FLAG ${CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG}) -set(CMAKE_SHARED_LIBRARY_CREATE_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS}) -set(CMAKE_SHARED_LIBRARY_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_CXX_FLAGS}) -#set(CMAKE_SHARED_LIBRARY_LINK_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS}) -set(CMAKE_SHARED_LIBRARY_RUNTIME_HIP_FLAG ${CMAKE_SHARED_LIBRARY_RUNTIME_CXX_FLAG}) -set(CMAKE_SHARED_LIBRARY_RUNTIME_HIP_FLAG_SEP ${CMAKE_SHARED_LIBRARY_RUNTIME_CXX_FLAG_SEP}) -set(CMAKE_SHARED_LIBRARY_LINK_STATIC_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_LINK_STATIC_CXX_FLAGS}) -set(CMAKE_SHARED_LIBRARY_LINK_DYNAMIC_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_LINK_DYNAMIC_CXX_FLAGS}) -# Set the CMake Flags to use the HCC Compilier. 
-set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_PATH} -o ") -set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_PATH} -o -shared" ) -set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_PATH} -o ") +get_filename_component(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_DIR}/../" REALPATH) -############################################################################### -# FIND: HIP and associated helper binaries -############################################################################### # HIP is supported on Linux only if(UNIX AND NOT APPLE AND NOT CYGWIN) # Search for HIP installation @@ -76,32 +38,15 @@ if(UNIX AND NOT APPLE AND NOT CYGWIN) # Search in user specified path first find_path( HIP_ROOT_DIR - NAMES hipconfig + NAMES bin/hipconfig PATHS - ENV ROCM_PATH + "$ENV{ROCM_PATH}/hip" ENV HIP_PATH - PATH_SUFFIXES bin - DOC "HIP installed location" - NO_DEFAULT_PATH - ) - # Now search in default path - find_path( - HIP_ROOT_DIR - NAMES hipconfig - PATHS - /opt/rocm + ${_IMPORT_PREFIX} /opt/rocm/hip - PATH_SUFFIXES bin DOC "HIP installed location" + NO_DEFAULT_PATH ) - - # Check if we found HIP installation - if(HIP_ROOT_DIR) - # If so, fix the path - string(REGEX REPLACE "[/\\\\]?bin[64]*[/\\\\]?$" "" HIP_ROOT_DIR ${HIP_ROOT_DIR}) - # And push it back to the cache - set(HIP_ROOT_DIR ${HIP_ROOT_DIR} CACHE PATH "HIP installed location" FORCE) - endif() if(NOT EXISTS ${HIP_ROOT_DIR}) if(HIP_FIND_REQUIRED) message(FATAL_ERROR "Specify HIP_ROOT_DIR") @@ -109,6 +54,8 @@ if(UNIX AND NOT APPLE AND NOT CYGWIN) message("HIP_ROOT_DIR not found or specified") endif() endif() + # And push it back to the cache + set(HIP_ROOT_DIR ${HIP_ROOT_DIR} CACHE PATH "HIP installed location" FORCE) endif() # Find HIPCC executable @@ -202,6 +149,28 @@ if(UNIX AND NOT APPLE AND NOT CYGWIN) set(HIP_PLATFORM ${_hip_platform} CACHE STRING "HIP platform as computed by hipconfig") mark_as_advanced(HIP_PLATFORM) endif() + + 
if(HIP_HIPCONFIG_EXECUTABLE AND NOT HIP_COMPILER) + # Compute the compiler + execute_process( + COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --compiler + OUTPUT_VARIABLE _hip_compiler + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + set(HIP_COMPILER ${_hip_compiler} CACHE STRING "HIP compiler as computed by hipconfig") + mark_as_advanced(HIP_COMPILER) + endif() + + if(HIP_HIPCONFIG_EXECUTABLE AND NOT HIP_RUNTIME) + # Compute the runtime + execute_process( + COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --runtime + OUTPUT_VARIABLE _hip_runtime + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + set(HIP_RUNTIME ${_hip_runtime} CACHE STRING "HIP runtime as computed by hipconfig") + mark_as_advanced(HIP_RUNTIME) + endif() endif() include(FindPackageHandleStandardArgs) @@ -212,9 +181,59 @@ find_package_handle_standard_args( HIP_HIPCC_EXECUTABLE HIP_HIPCONFIG_EXECUTABLE HIP_PLATFORM + HIP_COMPILER + HIP_RUNTIME VERSION_VAR HIP_VERSION ) +############################################################################### +# Set HIP CMAKE Flags +############################################################################### +# Copy the invocation styles from CXX to HIP +set(CMAKE_HIP_ARCHIVE_CREATE ${CMAKE_CXX_ARCHIVE_CREATE}) +set(CMAKE_HIP_ARCHIVE_APPEND ${CMAKE_CXX_ARCHIVE_APPEND}) +set(CMAKE_HIP_ARCHIVE_FINISH ${CMAKE_CXX_ARCHIVE_FINISH}) +set(CMAKE_SHARED_LIBRARY_SONAME_HIP_FLAG ${CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG}) +set(CMAKE_SHARED_LIBRARY_CREATE_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS}) +set(CMAKE_SHARED_LIBRARY_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_CXX_FLAGS}) +#set(CMAKE_SHARED_LIBRARY_LINK_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS}) +set(CMAKE_SHARED_LIBRARY_RUNTIME_HIP_FLAG ${CMAKE_SHARED_LIBRARY_RUNTIME_CXX_FLAG}) +set(CMAKE_SHARED_LIBRARY_RUNTIME_HIP_FLAG_SEP ${CMAKE_SHARED_LIBRARY_RUNTIME_CXX_FLAG_SEP}) +set(CMAKE_SHARED_LIBRARY_LINK_STATIC_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_LINK_STATIC_CXX_FLAGS}) +set(CMAKE_SHARED_LIBRARY_LINK_DYNAMIC_HIP_FLAGS 
${CMAKE_SHARED_LIBRARY_LINK_DYNAMIC_CXX_FLAGS}) + +set(HIP_CLANG_PARALLEL_BUILD_COMPILE_OPTIONS "") +set(HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS "") + +if("${HIP_COMPILER}" STREQUAL "hcc") + # Set the CMake Flags to use the HCC Compiler. + set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o ") + set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -shared" ) + set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o ") +elseif("${HIP_COMPILER}" STREQUAL "clang") + #Number of parallel jobs by default is 1 + if(NOT DEFINED HIP_CLANG_NUM_PARALLEL_JOBS) + set(HIP_CLANG_NUM_PARALLEL_JOBS 1) + endif() + #Add support for parallel build and link + if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang") + check_cxx_compiler_flag("-parallel-jobs=1" HIP_CLANG_SUPPORTS_PARALLEL_JOBS) + endif() + if(HIP_CLANG_NUM_PARALLEL_JOBS GREATER 1) + if(${HIP_CLANG_SUPPORTS_PARALLEL_JOBS}) + set(HIP_CLANG_PARALLEL_BUILD_COMPILE_OPTIONS "-parallel-jobs=${HIP_CLANG_NUM_PARALLEL_JOBS} -Wno-format-nonliteral") + set(HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS "-parallel-jobs=${HIP_CLANG_NUM_PARALLEL_JOBS}") + else() + message("clang compiler doesn't support parallel jobs") + endif() + endif() + + # Set the CMake Flags to use the HIP-Clang Compiler. 
+ set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} -o ") + set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} -o -shared" ) + set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} -o ") +endif() + ############################################################################### # MACRO: Locate helper files ############################################################################### @@ -247,11 +266,13 @@ hip_find_helper_file(run_hipcc cmake) macro(HIP_RESET_FLAGS) unset(HIP_HIPCC_FLAGS) unset(HIP_HCC_FLAGS) + unset(HIP_CLANG_FLAGS) unset(HIP_NVCC_FLAGS) foreach(config ${_hip_configuration_types}) string(TOUPPER ${config} config_upper) unset(HIP_HIPCC_FLAGS_${config_upper}) unset(HIP_HCC_FLAGS_${config_upper}) + unset(HIP_CLANG_FLAGS_${config_upper}) unset(HIP_NVCC_FLAGS_${config_upper}) endforeach() endmacro() @@ -259,27 +280,37 @@ endmacro() ############################################################################### # MACRO: Separate the options from the sources ############################################################################### -macro(HIP_GET_SOURCES_AND_OPTIONS _sources _cmake_options _hipcc_options _hcc_options _nvcc_options) +macro(HIP_GET_SOURCES_AND_OPTIONS _sources _cmake_options _hipcc_options _hcc_options _clang_options _nvcc_options) set(${_sources}) set(${_cmake_options}) set(${_hipcc_options}) set(${_hcc_options}) + set(${_clang_options}) set(${_nvcc_options}) set(_hipcc_found_options FALSE) set(_hcc_found_options FALSE) + set(_clang_found_options FALSE) set(_nvcc_found_options FALSE) foreach(arg ${ARGN}) if("x${arg}" STREQUAL "xHIPCC_OPTIONS") set(_hipcc_found_options TRUE) set(_hcc_found_options FALSE) + set(_clang_found_options FALSE) set(_nvcc_found_options FALSE) elseif("x${arg}" STREQUAL "xHCC_OPTIONS") 
set(_hipcc_found_options FALSE) set(_hcc_found_options TRUE) + set(_clang_found_options FALSE) + set(_nvcc_found_options FALSE) + elseif("x${arg}" STREQUAL "xCLANG_OPTIONS") + set(_hipcc_found_options FALSE) + set(_hcc_found_options FALSE) + set(_clang_found_options TRUE) set(_nvcc_found_options FALSE) elseif("x${arg}" STREQUAL "xNVCC_OPTIONS") set(_hipcc_found_options FALSE) set(_hcc_found_options FALSE) + set(_clang_found_options FALSE) set(_nvcc_found_options TRUE) elseif( "x${arg}" STREQUAL "xEXCLUDE_FROM_ALL" OR @@ -293,6 +324,8 @@ macro(HIP_GET_SOURCES_AND_OPTIONS _sources _cmake_options _hipcc_options _hcc_op list(APPEND ${_hipcc_options} ${arg}) elseif(_hcc_found_options) list(APPEND ${_hcc_options} ${arg}) + elseif(_clang_found_options) + list(APPEND ${_clang_options} ${arg}) elseif(_nvcc_found_options) list(APPEND ${_nvcc_options} ${arg}) else() @@ -426,9 +459,10 @@ macro(HIP_PREPARE_TARGET_COMMANDS _target _format _generated_files _source_files endforeach() endif() - HIP_GET_SOURCES_AND_OPTIONS(_hip_sources _hip_cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN}) + HIP_GET_SOURCES_AND_OPTIONS(_hip_sources _hip_cmake_options _hipcc_options _hcc_options _clang_options _nvcc_options ${ARGN}) HIP_PARSE_HIPCC_OPTIONS(HIP_HIPCC_FLAGS ${_hipcc_options}) HIP_PARSE_HIPCC_OPTIONS(HIP_HCC_FLAGS ${_hcc_options}) + HIP_PARSE_HIPCC_OPTIONS(HIP_CLANG_FLAGS ${_clang_options}) HIP_PARSE_HIPCC_OPTIONS(HIP_NVCC_FLAGS ${_nvcc_options}) # Add the compile definitions @@ -450,6 +484,7 @@ macro(HIP_PREPARE_TARGET_COMMANDS _target _format _generated_files _source_files # If we are building a shared library, add extra flags to HIP_HIPCC_FLAGS if(_hip_build_shared_libs) list(APPEND HIP_HCC_FLAGS "-fPIC") + list(APPEND HIP_CLANG_FLAGS "-fPIC") list(APPEND HIP_NVCC_FLAGS "--shared -Xcompiler '-fPIC'") endif() @@ -460,12 +495,14 @@ macro(HIP_PREPARE_TARGET_COMMANDS _target _format _generated_files _source_files set(_HIP_HOST_FLAGS "set(CMAKE_HOST_FLAGS 
${CMAKE_${HIP_C_OR_CXX}_FLAGS})") set(_HIP_HIPCC_FLAGS "set(HIP_HIPCC_FLAGS ${HIP_HIPCC_FLAGS})") set(_HIP_HCC_FLAGS "set(HIP_HCC_FLAGS ${HIP_HCC_FLAGS})") + set(_HIP_CLANG_FLAGS "set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS})") set(_HIP_NVCC_FLAGS "set(HIP_NVCC_FLAGS ${HIP_NVCC_FLAGS})") foreach(config ${_hip_configuration_types}) string(TOUPPER ${config} config_upper) set(_HIP_HOST_FLAGS "${_HIP_HOST_FLAGS}\nset(CMAKE_HOST_FLAGS_${config_upper} ${CMAKE_${HIP_C_OR_CXX}_FLAGS_${config_upper}})") set(_HIP_HIPCC_FLAGS "${_HIP_HIPCC_FLAGS}\nset(HIP_HIPCC_FLAGS_${config_upper} ${HIP_HIPCC_FLAGS_${config_upper}})") set(_HIP_HCC_FLAGS "${_HIP_HCC_FLAGS}\nset(HIP_HCC_FLAGS_${config_upper} ${HIP_HCC_FLAGS_${config_upper}})") + set(_HIP_CLANG_FLAGS "${_HIP_CLANG_FLAGS}\nset(HIP_CLANG_FLAGS_${config_upper} ${HIP_CLANG_FLAGS_${config_upper}})") set(_HIP_NVCC_FLAGS "${_HIP_NVCC_FLAGS}\nset(HIP_NVCC_FLAGS_${config_upper} ${HIP_NVCC_FLAGS_${config_upper}})") endforeach() @@ -571,15 +608,34 @@ endmacro() ############################################################################### macro(HIP_ADD_EXECUTABLE hip_target) # Separate the sources from the options - HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN}) - HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) + HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _clang_options _nvcc_options ${ARGN}) + HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} CLANG_OPTIONS ${_clang_options} NVCC_OPTIONS ${_nvcc_options}) if(_source_files) list(REMOVE_ITEM _sources ${_source_files}) endif() - if("x${HCC_HOME}" STREQUAL "x") - set(HCC_HOME "/opt/rocm/hcc") + if("${HIP_COMPILER}" STREQUAL "hcc") + if("x${HCC_HOME}" STREQUAL "x") + if (DEFINED 
$ENV{ROCM_PATH}) + set(HCC_HOME "$ENV{ROCM_PATH}/hcc") + elseif( DEFINED $ENV{HIP_PATH}) + set(HCC_HOME "$ENV{HIP_PATH}/../hcc") + else() + set(HCC_HOME "/opt/rocm/hcc") + endif() + endif() + set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o ") + elseif("${HIP_COMPILER}" STREQUAL "clang") + if("x${HIP_CLANG_PATH}" STREQUAL "x") + if (DEFINED $ENV{ROCM_PATH}) + set(HIP_CLANG_PATH "$ENV{ROCM_PATH}/llvm/bin") + elseif( DEFINED $ENV{HIP_PATH}) + set(HIP_CLANG_PATH "$ENV{HIP_PATH}/../llvm/bin") + else() + set(HIP_CLANG_PATH "/opt/rocm/llvm/bin") + endif() + endif() + set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} -o ") endif() - set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o ") add_executable(${hip_target} ${_cmake_options} ${_generated_files} ${_sources}) set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE HIP) endmacro() @@ -589,8 +645,8 @@ endmacro() ############################################################################### macro(HIP_ADD_LIBRARY hip_target) # Separate the sources from the options - HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN}) - HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} ${_cmake_options} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) + HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _clang_options _nvcc_options ${ARGN}) + HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} ${_cmake_options} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} CLANG_OPTIONS ${_clang_options} NVCC_OPTIONS ${_nvcc_options}) if(_source_files) list(REMOVE_ITEM _sources ${_source_files}) endif() diff --git a/thirdParty/cupla/alpaka/cmake/modules/FindHIP/run_hipcc.cmake 
b/thirdParty/cupla/alpaka/cmake/modules/FindHIP/run_hipcc.cmake index c9582bdbd4..ed025fefe5 100644 --- a/thirdParty/cupla/alpaka/cmake/modules/FindHIP/run_hipcc.cmake +++ b/thirdParty/cupla/alpaka/cmake/modules/FindHIP/run_hipcc.cmake @@ -1,25 +1,3 @@ -# /* -# Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. 
-# */ - ############################################################################### # Runs commands using HIPCC ############################################################################### @@ -49,12 +27,16 @@ set(HIP_HOST_COMPILER "@HIP_HOST_COMPILER@") # path set(CMAKE_COMMAND "@CMAKE_COMMAND@") # path set(HIP_run_make2cmake "@HIP_run_make2cmake@") # path set(HCC_HOME "@HCC_HOME@") #path +set(HIP_CLANG_PATH "@HIP_CLANG_PATH@") #path +set(HIP_CLANG_PARALLEL_BUILD_COMPILE_OPTIONS "@HIP_CLANG_PARALLEL_BUILD_COMPILE_OPTIONS@") @HIP_HOST_FLAGS@ @_HIP_HIPCC_FLAGS@ @_HIP_HCC_FLAGS@ +@_HIP_CLANG_FLAGS@ @_HIP_NVCC_FLAGS@ -set(HIP_HIPCC_INCLUDE_ARGS "@HIP_HIPCC_INCLUDE_ARGS@") # list (needs to be in quotes to handle spaces properly) +#Needed to bring the HIP_HIPCC_INCLUDE_ARGS variable in scope +set(HIP_HIPCC_INCLUDE_ARGS @HIP_HIPCC_INCLUDE_ARGS@) # list set(cmake_dependency_file "@cmake_dependency_file@") # path set(source_file "@source_file@") # path @@ -62,13 +44,23 @@ set(host_flag "@host_flag@") # bool # Determine compiler and compiler flags execute_process(COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --platform OUTPUT_VARIABLE HIP_PLATFORM OUTPUT_STRIP_TRAILING_WHITESPACE) +execute_process(COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --compiler OUTPUT_VARIABLE HIP_COMPILER OUTPUT_STRIP_TRAILING_WHITESPACE) +execute_process(COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --runtime OUTPUT_VARIABLE HIP_RUNTIME OUTPUT_STRIP_TRAILING_WHITESPACE) if(NOT host_flag) set(__CC ${HIP_HIPCC_EXECUTABLE}) - if(HIP_PLATFORM STREQUAL "hcc") - if(NOT "x${HCC_HOME}" STREQUAL "x") - set(ENV{HCC_HOME} ${HCC_HOME}) + if("${HIP_PLATFORM}" STREQUAL "hcc") + if("${HIP_COMPILER}" STREQUAL "hcc") + if(NOT "x${HCC_HOME}" STREQUAL "x") + set(ENV{HCC_HOME} ${HCC_HOME}) + endif() + set(__CC_FLAGS ${HIP_HIPCC_FLAGS} ${HIP_HCC_FLAGS} ${HIP_HIPCC_FLAGS_${build_configuration}} ${HIP_HCC_FLAGS_${build_configuration}}) + elseif("${HIP_COMPILER}" STREQUAL "clang") + if(NOT "x${HIP_CLANG_PATH}" STREQUAL "x") + 
set(ENV{HIP_CLANG_PATH} ${HIP_CLANG_PATH}) + endif() + # Temporarily include HIP_HCC_FLAGS for HIP-Clang for PyTorch builds + set(__CC_FLAGS ${HIP_CLANG_PARALLEL_BUILD_COMPILE_OPTIONS} ${HIP_HIPCC_FLAGS} ${HIP_HCC_FLAGS} ${HIP_CLANG_FLAGS} ${HIP_HIPCC_FLAGS_${build_configuration}} ${HIP_CLANG_FLAGS_${build_configuration}}) endif() - set(__CC_FLAGS ${HIP_HIPCC_FLAGS} ${HIP_HCC_FLAGS} ${HIP_HIPCC_FLAGS_${build_configuration}} ${HIP_HCC_FLAGS_${build_configuration}}) else() set(__CC_FLAGS ${HIP_HIPCC_FLAGS} ${HIP_NVCC_FLAGS} ${HIP_HIPCC_FLAGS_${build_configuration}} ${HIP_NVCC_FLAGS_${build_configuration}}) endif() diff --git a/thirdParty/cupla/alpaka/cmake/modules/FindHIP/run_make2cmake.cmake b/thirdParty/cupla/alpaka/cmake/modules/FindHIP/run_make2cmake.cmake index 48a51fa039..d2e3eb5169 100644 --- a/thirdParty/cupla/alpaka/cmake/modules/FindHIP/run_make2cmake.cmake +++ b/thirdParty/cupla/alpaka/cmake/modules/FindHIP/run_make2cmake.cmake @@ -1,25 +1,3 @@ -# /* -# Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. -# */ - ############################################################################### # Computes dependencies using HIPCC ############################################################################### diff --git a/thirdParty/cupla/alpaka/cmake/modules/FindROCR.cmake b/thirdParty/cupla/alpaka/cmake/modules/FindROCR.cmake new file mode 100644 index 0000000000..2b198dcf8f --- /dev/null +++ b/thirdParty/cupla/alpaka/cmake/modules/FindROCR.cmake @@ -0,0 +1,16 @@ +# Try to find ROCR (Radeon Open Compute Runtime) +# +# Once found, this will define: +# - ROCR_FOUND - ROCR status (found or not found) +# - ROCR_INCLUDES - Required ROCR include directories +# - ROCR_LIBRARIES - Required ROCR libraries +find_path(FIND_ROCR_INCLUDES hsa.h HINTS /opt/rocm/include /opt/rocm/hsa/include PATH_SUFFIXES hsa) +find_library(FIND_ROCR_LIBRARIES hsa-runtime64 HINTS /opt/rocm/lib /opt/rocm/hsa/lib) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(ROCR DEFAULT_MSG + FIND_ROCR_INCLUDES FIND_ROCR_LIBRARIES) +mark_as_advanced(FIND_ROCR_INCLUDES FIND_ROCR_LIBRARIES) + +set(ROCR_INCLUDES ${FIND_ROCR_INCLUDES}) +set(ROCR_LIBRARIES ${FIND_ROCR_LIBRARIES}) diff --git a/thirdParty/cupla/alpaka/cmake/modules/FindROCT.cmake b/thirdParty/cupla/alpaka/cmake/modules/FindROCT.cmake new file mode 100644 index 0000000000..37f08fcff7 --- /dev/null +++ b/thirdParty/cupla/alpaka/cmake/modules/FindROCT.cmake @@ -0,0 +1,16 @@ +# Try to find ROCT (Radeon Open Compute Thunk) +# +# Once found, this will define: +# - ROCT_FOUND - ROCT status (found or not found) +# - ROCT_INCLUDES - Required ROCT include directories +# - ROCT_LIBRARIES - Required ROCT libraries +find_path(FIND_ROCT_INCLUDES hsakmt.h HINTS 
/opt/rocm/include) +find_library(FIND_ROCT_LIBRARIES hsakmt HINTS /opt/rocm/lib) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(ROCT DEFAULT_MSG + FIND_ROCT_INCLUDES FIND_ROCT_LIBRARIES) +mark_as_advanced(FIND_ROCT_INCLUDES FIND_ROCT_LIBRARIES) + +set(ROCT_INCLUDES ${FIND_ROCT_INCLUDES}) +set(ROCT_LIBRARIES ${FIND_ROCT_LIBRARIES}) diff --git a/thirdParty/cupla/alpaka/doc/markdown/Index.md b/thirdParty/cupla/alpaka/doc/markdown/Index.md deleted file mode 100644 index 7eb03d2581..0000000000 --- a/thirdParty/cupla/alpaka/doc/markdown/Index.md +++ /dev/null @@ -1,19 +0,0 @@ -* User Documentation - * 1. [Introduction](user/Introduction.md) - * 2. [Abstraction](user/Abstraction.md) - * 1. [Thread](user/abstraction/Thread.md) - * 2. [Block](user/abstraction/Block.md) - * 3. [Warp](user/abstraction/Warp.md) - * 4. [Element](user/abstraction/Element.md) - * 3. [Implementation](user/Implementation.md) - * 1. [Library Interface](user/implementation/Library.md) - * 1. [Structure](user/implementation/library/Structure.md) - * 2. [Usage](user/implementation/library/Usage.md) - * 3. [Rationale](user/implementation/library/Rationale.md) - * 4. [Details](user/implementation/library/Details.md) - * 2. [Mapping onto Specific Hardware Architectures](user/implementation/Mapping.md) - * 1. [CUDA GPUs](user/implementation/mapping/CUDA.md) - * 2. [x86 CPUs](user/implementation/mapping/x86.md) - * 3. [Accelerators](user/implementation/mapping/Accelerators.md) -* Developer Documentation - * 1. [Code Formatting](dev/style.md) diff --git a/thirdParty/cupla/alpaka/doc/markdown/dev/style.md b/thirdParty/cupla/alpaka/doc/markdown/dev/style.md deleted file mode 100644 index 70841cf6d1..0000000000 --- a/thirdParty/cupla/alpaka/doc/markdown/dev/style.md +++ /dev/null @@ -1,146 +0,0 @@ -[:arrow_up: Up](../Index.md) - -Style -===== - -Naming ------- - -* Types are always in PascalCase (KernlExecCuda, BufT, ...) and singular. 
-* Variables are always in camelCase (memBufHost, ...) and plural for collections and singular else. -* Namespaces are always in lowercase and singular is preferred. -* There are no two consecutive upper case letters (AccOpenMp, HtmlRenderer, IoHandler, ...). This makes names more easily readable. - - -Types ------ - -* Always use integral types with known width (`int32_t`, `uin64_t`, ...). -Never use `int`, `unisgned long`, etc. - - -Type Qualifiers ---------------------- - -The order of type qualifiers should be: -```Type const * const``` for a const pointer to a const Type. -```Type const &``` for a reference to a const Type. - -The reason is that types can be read from right to left correctly without jumping back and forth. -```const Type * const``` and ```const Type &``` would require jumping in either way to read them correctly. - - -Variables ---------- - -* Variables should always be initialized on construction because this can produce hard to debug errors. -This can (nearly) always be done even in performance critical code without sacrificing speed by using a functional programming style. -* Variables should (nearly) always be `const` to make the code more easy to understand. -This is equivalent to functional programming and the SSA (static single assignment) style used by LLVM. -This should have no speed implication as every half baked compiler analyses the usage of variables and reuses registers. -* Variable definitions should be differentiated from assignments by using either `(...)` or `{...}` but never `=` for definitions. -Use `uint32_t const iUsageOfThisVariable(42);` instead of `uint32_t const iUsageOfThisVariable = 42;` - - -Comments --------- - -* Always use C++-Style comments `//` -* For types use `//#############################################################################` to start the comment block. -* For functions use `//-----------------------------------------------------------------------------` to start the comment block. 
-* Never write comments for closing braces (namespaces, classes, etc ...) - - -Braces ------- - -* Braces (opening and closing) for classes, structs, functions, namespaces, etc. appear on a new line. Exception: If the function or class body is empty, the opening and closing braces are on the same (next) line. -* Only braces for variable initialization can appear in-line. - - -Indentation ------------ - -* Always indent everything by *one level* (namespace body, class members, function body, ...) -* Do not use more indentation e.g. to align function parameters. - - -Spaces ------- - -* Trailing white-spaces are forbidden. -* There is no space between keywords (if, for, ...) and the opening parenthesis. -* There is no space after the opening `(` or `<` and before the closing `)` `>`. -* There is a space before and after binary operators (=, *, +, ...) -* There is no space after the unary operators !, ~, ... - - -Functions ---------- - -* Always use the trailing return type syntax with the return type on a new line even if the return type is void: -```C++ -auto func() --> bool -``` - * This makes it easier to see the return type because it is on its own line. - * This leads to a consistent style for constructs where there is no alternative style (lambdas, functions templates with dependent return types) and standard functions. - -* Each function parameter is on a new indented line: -```C++ -auto func( - float f1, - float f2) --> bool -{ - return true -} -``` -```C++ -func( - 1.0f, - 2.0f); -``` - * Makes it easier to see how many parameters there are and which position they have. - - -Templates ---------- - -* Template parameters are prefixed with `T` to differentiate them from class or function local typedefs. - -* Each template parameter is on a new indented line: -```C++ -template< - typename TParam, - typename TArgs...> -auto func() --> bool -``` - * Makes it easier to see how many template parameters there are and which position they have. 
- -* Always use ```typename``` for template parameters. There is NO difference to class and typename matches the intent better. - - -Traits ------ - -* Trait classes always have one more template parameter (with default parameter) than is required for enabling SFINAE in the specialization: -```C++ -template< - typename T, - typename TSfinae = void> -struct GetOffsets; -``` - -* Template trait aliases always end with a `T` e.g. `BufT` while the corresponding trait ends with `Type` e.g. `BufType` - -* Traits for implementations always have the same name as the accessor function but in PascalCase while the member function is camelCase again: `sin(){...}` and `Sin{sin(){...}};` - -Includes -------- - -* The order of includes is from the most specialized header to the most general one. -This order helps to find missing includes in more specialized headers because the general ones are always included afterwards. - -* A comment with the types or functions included by an include file makes it easier to find out why a special header is included. diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/Abstraction.md b/thirdParty/cupla/alpaka/doc/markdown/user/Abstraction.md deleted file mode 100644 index 3ba3199449..0000000000 --- a/thirdParty/cupla/alpaka/doc/markdown/user/Abstraction.md +++ /dev/null @@ -1,131 +0,0 @@ -[:arrow_up: Up](../Index.md) - -Abstraction =========== - - - -Parallelism and memory hierarchies at all levels need to be exploited in order to achieve performance portability across various types of accelerators. -Within this chapter an abstraction will be derived that tries to provide a maximum of parallelism while simultaneously considering implementability and applicability in hardware. - -Looking at the current HPC hardware landscape, we often see nodes with multiple sockets/processors extended by accelerators like GPUs or Intel Xeon Phi, each with their own processing units. 
-Within a CPU or an Intel Xeon Phi there are cores with hyper-threads, vector units and a large caching infrastructure. -Within a GPU there are many small cores and only few caches. -Each entity in the hierarchy has access to different memories. -For example, each socket / processor manages its RAM, while the cores additionally have non-explicit access to L3, L2 and L1 caches. -On a GPU there are global, constant, shared and other memory types which all can be accessed explicitly. -The interface has to abstract from these differences without sacrificing speed on any platform. - -A process running on a multi-socket node is the largest entity within *alpaka*. -The abstraction is only about the task and data parallel execution on the process/node level and down. -It does not provide any primitives for inter-node communication. -However, such libraries can be combined with *alpaka*. - -An application process always has a main thread and is by definition running on the host. -It can access the host memory and various accelerator devices. -Such accelerators can be GPUs, Intel Xeon Phis, the host itself or other devices. -Thus, the host does not necessarily have to be different from the accelerator device used for the computations. -For instance, an Intel Xeon Phi simultaneously can be the host and the accelerator device. - -The *alpaka* library can be used to offload the parallel execution of task and data parallel work simultaneously onto different accelerator devices. - -Task Parallelism ---------------- - -One of the basic building blocks of modern applications is task parallelism. -For example, the operating system scheduler, deciding which thread of which process gets how much processing time on which CPU core, enables task parallelism of applications. -It controls the execution of different tasks on different processing units. -Such task parallelism can be, for instance, the output of the progress in parallel to a download. 
-This can be implemented via two threads executing two different tasks. - -The valid dependencies between tasks within an application can be defined as a DAG (directed acyclic graph) in all cases. -The tasks are represented by nodes and the dependencies by edges. -In this model, a task is ready to be executed if the number of incoming edges is zero. -After a task has finished its work, it is removed from the graph as well as all of its outgoing edges. -This reduces the number of incoming edges of subsequent tasks. - -The problem with this model is the inherent overhead and the missing hardware and API support. -When it is directly implemented as a graph, at least all depending tasks have to be updated and checked if they are ready to be executed after a task finished. -Depending on the size of the graph and the number of edges this can be a huge overhead. - -*OpenCL* allows to define a task graph in a somewhat different way. -Tasks can be enqueued into an out-of-order command queue combined with events that have to be finished before the newly enqueued task can be started. -Tasks in the command queue with unmet dependencies are skipped and subsequent ones are executed. -The `CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE` property of a command queue is an optional feature only supported by few vendors. -Therefore, it can not be assumed to be available on all systems. - -*CUDA* on the other hand does currently (version 7.5) not support such out-of-order queues in any way. -The user has to define dependencies explicitly through the order the tasks are enqueued into the queues (called streams in *CUDA*). -Within a queue, tasks are always executed in sequential order, while multiple queues are executed in parallel. -Queues can wait for events enqueued into other queues. - -In both APIs, *OpenCL* and *CUDA*, a task graph can be emulated by creating one queue per task and enqueuing a unique event after each task, which can be used to wait for the preceding task. 
-However, this is not feasible due to the large queue and event creation costs as well as other overheads within this process. - -Therefore, to be compatible with a wide range of APIs, the interface for task parallelism has to be constrained. -Instead of a general DAG, multiple queues of sequentially executed tasks will be used to describe task parallelism. -Events that can be enqueued into the queues enhance the basic task parallelism by enabling synchronization between different queues, devices or the host threads. - -Data Parallelism ----------------- - -In contrast to task parallelism, data parallelism describes the execution of one and the same task on multiple, often related data elements. -For example, an image color space conversion is a textbook example of a data parallel task. -The same operation is executed independently on each pixel. -Other data parallel algorithms additionally introduce dependencies between threads in the input-, intermediate-, or output-data. -For example, the calculation of a brightness histogram has no input-data dependencies. -However, all pixel brightness values finally have to be merged into a single result. -Even these two simple examples show that it is necessary to think about the interaction of parallel entities to minimize the influence of data dependencies. - -Furthermore, it is necessary to respect the principles of spatial and temporal locality. -Current hardware is built around these locality principles to reduce latency by using hierarchical memory as a trade-off between speed and hardware size. -Multiple levels of caches, from small and very fast ones to very large and slower ones exploit temporal locality by keeping recently referenced data as close to the actual processing units as possible. -Spatial locality in the main memory is also important for caches because they are usually divided into multiple lines that can only be exchanged one cache line at a time. 
-If one data element is loaded and cached, it is highly likely that nearby elements are also cached. -If the pixels of an image are stored row wise but are read out column wise, the spatial locality assumption of many CPUs is violated and the performance suffers. -GPUs on the other hand do not have a large caching hierarchy but allow explicit access to a fast memory shared across multiple cores. -Therefore, the best way to process individual data elements of a data parallel task is dependent on the data structure as well as the underlying hardware. - -The main part of the *alpaka* abstraction is the way it abstracts data parallelism and allows the algorithm writer to take into account the hierarchy of processing units, their data parallel features and corresponding memory regions. -The abstraction developed is influenced and based on the groundbreaking *CUDA* and *OpenCL* abstractions of a multidimensional grid of threads with additional hierarchy levels in between. -Another level of parallelism is added to those abstractions to unify the data parallel capabilities of modern hardware architectures. -The explicit access to all hierarchy levels enables the user to write code that runs performant on all current platforms. -However, the abstraction does not try to automatically optimize memory accesses or data structures but gives the user full freedom to use data structures matching the underlying hardware preferences. - -The individual levels are explained on the following pages: - -1. [Thread](abstraction/Thread.md) -2. [Block](abstraction/Block.md) -3. [Warp](abstraction/Warp.md) -4. [Element](abstraction/Element.md) - -Summary -------- - -This abstraction is called *Redundant Hierarchical Parallelism*. -This term is inspired by the paper *The Future of Accelerator Programming: Abstraction, Performance or Can We Have Both?* [PDF](http://olab.is.s.u-tokyo.ac.jp/~kamil.rocki/rocki_burtscher_sac14.pdf) [DOI](http://dx.doi.org/10.1109/ICPADS.2013.76). 
-It investigates a similar *concept of copious parallel programming* reaching 80%-90% of the native performance while comparing CPU and GPU centric versions of an *OpenCL* n-body simulation with a general version utilizing parallelism on multiple hierarchy levels. - -The *CUDA* or *OpenCL* abstractions themselves are very similar to the one designed in the previous sections and consist of all but the Element level. -However, as has been shown, all five abstraction hierarchy levels are necessary to fully utilize current architectures. -By emulating unsupported or ignoring redundant levels of parallelism, algorithms written with this abstraction can always be mapped optimally to all supported accelerators. The following table summarizes the characteristics of the proposed hierarchy levels. - -| Hierarchy Level | Parallelism | Synchronizable | -| --- | --- | --- | -| grid | sequential / parallel | :x: / :white_check_mark: | -| block | parallel | :x: | -| warp | parallel | :white_check_mark: | -| thread | parallel / lock-step| :white_check_mark: | -| element | sequential | :x: | - -Depending on the queue a task is enqueued into, grids will either run in sequential order within the same queue or in parallel in different queues. -They can be synchronized by using events. -Blocks can not be synchronized and therefore can use the whole spectrum of parallelism ranging from fully parallel up to fully sequential execution depending on the device. -Warps combine the execution of multiple threads in lock-step and can be synchronized implicitly by synchronizing the threads they contain. -Threads within a block are executed in parallel warps and each thread computes a number of data elements sequentially. 
- diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/Implementation.md b/thirdParty/cupla/alpaka/doc/markdown/user/Implementation.md deleted file mode 100644 index 6dd7824082..0000000000 --- a/thirdParty/cupla/alpaka/doc/markdown/user/Implementation.md +++ /dev/null @@ -1,10 +0,0 @@ -[:arrow_up: Up](../Index.md) - -Implementation -============== - -The implementation of the library in C++, especially the way C++11 allows to define the abstract concepts and to take advantage of the zero-overhead compile-time polymorphism is explained in this section. -Furthermore, it is described how the abstraction can be mapped to real devices. - -1. [Library Interface](implementation/Library.md) -2. [Mapping onto Specific Hardware Architectures](implementation/Mapping.md) diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/Introduction.md b/thirdParty/cupla/alpaka/doc/markdown/user/Introduction.md deleted file mode 100644 index 87078731ad..0000000000 --- a/thirdParty/cupla/alpaka/doc/markdown/user/Introduction.md +++ /dev/null @@ -1,326 +0,0 @@ -[:arrow_up: Up](../Index.md) - -Introduction -============ - -The *alpaka* library defines and implements an abstract interface for the *hierarchical redundant parallelism* model. -This model exploits task- and data-parallelism as well as memory hierarchies at all levels of current multi-core architectures. -This allows to achieve portability of performant codes across various types of accelerators by ignoring specific unsupported levels and utilizing only the ones supported on a specific accelerator. -All hardware types (multi- and many-core CPUs, GPUs and other accelerators) are treated and can be programmed in the same way. -The *alpaka* library provides back-ends for *CUDA*, *OpenMP*, *Boost.Fiber* and other methods. -The policy-based C++ template interface provided allows for straightforward user-defined extension of the library to support other accelerators. 
- -The library name *alpaka* is an acronym standing for **A**bstraction **L**ibrary for **Pa**rallel **K**ernel **A**cceleration. - - -Motivation ----------- - -What scales well on current hardware does not necessarily scale well on future architectures. -The hardware landscape is always changing. -In the past the big clusters have been CPU only. -Today we see a change to accelerator supported computing. -For example, GPUs, Intel Xeon Phis or other special purpose extension cards are extensively used. -It is unpredictable what the next big step will be and how the Exaflop hardware will look like. -It is not clear that GPUs will always be the best platform. -Nevertheless, the underlying physical algorithms as well as the need for heterogeneous architectures will not change. - -Current highly parallel GPUs are optimized for throughput and hide latency and data dependencies by always keeping a ready pool of work. -This allows to sustain the performance at a high percent of peak. -CPUs in turn are designed to optimize the execution time of a single thread. -Features like branch prediction, speculative execution, register renaming and many more *[...] would cost far too much energy to be replicated for thousands of parallel GPU threads but [...] are entirely appropriate for CPUs.* ([State-of-the-art in Heterogeneous Computing](http://dx.doi.org/10.1155/2010/540159)) -Even more specialized architectures will appear and find their way into HPC. - -*The essence of the heterogeneous computing model is that one size does not fit all. Parallel and serial segments of the workload execute on the best-suited processor delivering faster overall performance, greater efficiency, and lower energy and cost per unit of computation.* ([State-of-the-art in Heterogeneous Computing](http://dx.doi.org/10.1155/2010/540159)) - -New hardware will not only allow to execute faster or calculate more but will furthermore enable the usage of new algorithms for more precise simulations. 
-For example, some tasks may require random searches for only a few values in a lookup table of up to hundreds of gigabytes. -This would perfectly fit a CPU, while the rest of the simulation would still be running on the GPUs. -With new hardware bringing those two worlds closer together, exploiting the heterogeneous hardware with heterogeneous algorithms will likely be the way to go in the future. -Being able to express both of those parallel tasks in the same way would greatly enhance the productivity of the programmer and the clarity of the code. - -Porting a complicated simulation code from *CUDA* to x86 and possibly to other hardware architectures is a non-trivial task. -A lot of developer time could be saved if this task did not have to be done repeatedly for every new hardware, but rather only once. -Therefore, *alpaka* tries to solve the problems in porting highly scalable simulation codes on various multi-core architectures. - - -Problems in Porting Performant HPC Codes ---------- - -Porting a highly performant code to a new architecture is a non-trivial task that poses many problems. -Often it is a requirement to keep the simulation operative on the previous platform as well. -This means that multiple hardware platforms have to be supported simultaneously. -A great number of projects take the route that seems easiest at first and simply duplicate all the parallel algorithms and port them to the new back-end. -All the specific API functions that have been used, have to be supplemented by the new counterparts, possibly guarded by preprocessor macros to switch between the old and the new version. -A switch of the back-end used in a simulation, for example, from *OpenMP* to *CUDA* often requires a near rewrite. -Each newly supported platform would have to duplicate the API specific kernel and invocation code lines. 
- -The following paragraphs will summarize problems that arise when performant HPC codes have to be ported: - -### Sustainability -Because the underlying HPC hardware is constantly changing, every new generation will require an adaption of the simulation. -Even to deliver the performance reached on previous architectures is a tough task for programmers. -Furthermore, nobody can guarantee the lifespan of the parallelization technique used. -*OpenMP*, *CUDA*, *OpenACC* and all the other possibilities could be discontinued or get deprecated for any reason at any time. -Therefore, an abstract interface is required that hides the particular back-end and allows to port the interface implementation and not the application using the interface itself. - -### Heterogeneity -Some parts of a simulation perfectly map to current GPUs while other parts are better computed on CPUs or other accelerators. -Furthermore, by letting one part of the heterogeneous cluster hardware idle, a lot of computing power is wasted. -It is essential, especially for future architectures, that those resources are utilized to reach the peak performance of the systems. -This heterogeneous work division not only depends on the architecture but also on the number of available hardware resources, the workload and many other factors. -Therefore, to reach good scaling across a multitude of systems, it is necessary to be able to dynamically decide where to execute which part of the simulation either at make-time, compile-time or at run-time. -Currently this requires to duplicate the kernels and write specific implementations per back-end. -Many projects only allow to switch the back-end of the whole simulation at once or possibly even per kernel at make-time. -This will not be enough on future architectures where the ability to mix the back-ends is required to optimally utilize different cluster architectures or to dynamically load balance tasks across a diverse set of (possibly failing) accelerator devices. 
-Therefore, an abstract interface unifying the abilities of all the back-ends is required to let the application express parallelism of the different back-ends in a unified algorithm that can then be mapped to the device currently in use. - -### Maintainability -Looking at the software engineering aspects, duplication is a bad solution because this leads to maintainability issues. -In many projects such copies result in a large growth in the number of lines of code while only minimal new functionality is implemented. -Most of the new code only executes things that have already been implemented for the initial platform. -Developers having to change one of the algorithms additionally have to change all duplicates for all other back-ends. -Depending on the similarity of the implementations, this can result in a doubling / multiplication of developer efforts in the worst-case scenario. -Especially for open-source projects that rely on contributions from the community this raises the hurdle for new developers because they have to know not only one, but multiple different parallelization libraries. -In the end good maintainability is what keeps a software project alive and what ensures a steady development progress. -Therefore, an interface hiding the differences between all the back-ends is required to let the application express parallelism in a unified algorithm. - -### Testability -Code duplication, being the easiest way to port a simulation, exacerbates testing. -Each new kernel has to be tested separately because different bugs could have been introduced into the distinct implementations. -If the versions can be mixed, it is even harder because all combinations have to be tested. -Often the tests (continuous integration tests, unit tests, etc.) have to run on a special testing hardware or on the production systems due to the reliance on the availability of special accelerators. 
-For example, *CUDA* compile tests are possible without appropriate hardware but it is not feasible to execute even simple runtime tests due to the missing CPU emulation support. -An interface allowing to switch between acceleration back-ends, which are tested for compatibility among each other, enables easy testing on development and test systems. - -### Optimizability -Even if the simulation code has encapsulated the APIs used, the optimal way to write performant algorithms often differs between distinct parallelization frameworks. -It is necessary to allow the user to fine-tune the algorithm to run optimally on each different accelerator device by compile time specialization or policy based abstractions without the need to duplicate the kernel. -Within the kernel there has to be knowledge about the underlying platform to adaptively use data structures that map optimally onto the current architecture. -To ease this optimization work, libraries with data structures, communication patterns and other things hiding the differences between back-ends have to be implemented. -This would allow to optimize the interface implementation and not the simulation itself. - -In summary, it can be stated that all the portability problems of current HPC codes could be solved by introducing an abstract interface that hides the particular back-end implementations and unifies the way to access the parallelism available on modern many-core architectures. - - -Similar Projects ----------------- - -There are multiple other libraries targeting the (portable) parallel task execution within nodes. -Some of them require language extensions, others pretend to achieve full performance portability across a multitude of devices. -But none of these libraries can provide full control over the (possibly diverse) underlying hardware while being only minimal invasive. -There is always a productivity-performance trade-off. 
- -Furthermore, many of the libraries do not satisfy the requirement for full single-source C++ support. -This is essential because many simulation codes heavily rely on template meta-programming for method specialization and compile time optimizations. - - -### CUDA - Compute Unified Device Architecture - -*CUDA* is a parallel computing platform and programming model developed by *NVIDIA*. -It is used in science and research as well as in consumer software to compute highly parallel workloads on GPUs starting from image and video editing up to simulations on high-performance computers. -Such usage of graphics processing units not only for computer graphics, but also for tasks that have traditionally been handled by the CPU is called GPGPU (general-purpose computing on graphics processing units). -A disadvantage of *CUDA* is that its application is bound to the usage of *NVIDIA* GPUs. -Currently no other vendors provide accelerators that support *CUDA*. -Additionally there is no supported free emulator allowing to execute *CUDA* code on CPUs. - -The *CUDA* API is a higher level part of the programming model which allows to access and execute code on GPUs from multiple host languages including C++. -The *CUDA* C/C++ language on the other hand is a mid level construct based on standard C++ with some extensions for accelerator programming and limitations in the supported constructs. -For example, throwing and catching exceptions as well as run-time type information (RTTI) are not supported. -*CUDA* C/C++ is compiled to a low level virtual instruction set called PTX (Parallel Thread Execution). -The PTX code is later compiled to assembler code by the GPU driver. - -*NVIDIA* provides an extended C++ compiler based on the LLVM clang compiler called nvcc that allows to mix host C++ code using the *CUDA* API with *CUDA* C/C++. 
-The host part of the C++ code is compiled by the respective host system compiler (gcc, icc, clang, MSVC) while the GPU device code is separately compiled to PTX. -After the compilation steps both binaries are linked together to form the final assembly. - -*CUDA* defines a heterogeneous programming model where tasks are offloaded from the host CPU to the device GPU. -Functions that should be offloaded to the GPU are called kernels. -As can be seen in the figure below a grid of such kernels is executed in parallel by multiple threads organized in blocks. -Threads within a block can synchronize, while blocks are executed independently and possibly in sequential order depending on the underlying hardware. -![grid-of-thread-blocks](https://docs.nvidia.com/cuda/cuda-c-programming-guide/graphics/grid-of-thread-blocks.png) - -The global device memory is the slowest but largest memory accessible by all threads. -It can be accessed from host code via methods provided by the *CUDA* API. -Global memory is persistent across kernel invocations. -Threads within a block can communicate through a fast but small shared memory. -Each thread has a set of very low latency registers similar to CPU threads. -Additionally there are special purpose memory sections for constant and texture data. - -The *CUDA* C/C++ language gives full control over memory, caches and the execution of kernels. - - -### [PGI CUDA-X86](https://www.pgroup.com/resources/cuda-x86.htm) -is a compiler technology that allows to generate x86-64 binary code from *CUDA* C/C++ applications using the *CUDA Runtime API* but does not support the *CUDA Driver API*. -At run-time *CUDA* C programs compiled for x86 execute each *CUDA* thread block using a single host core, eliminating synchronization where possible. -Multiple kernel threads are combined to be executed together via the CPUs SIMD (Single Instruction Multiple Data) capabilities for vectorized execution. 
-The *PGI Unified Binary technology* allows to create a single binary that uses *NVIDIA* GPUs when available, or runs on multi-core CPUs else. -The compiler is not always up-to-date with the latest *CUDA* versions and is not available for free. -Furthermore, the compiler seems not to be developed actively since *NVIDIA* acquired *PGI* in 2013. -Since 2012 no news were published and nothing could be found in the yearly release notes of the *PGI* compiler suite. - - -### [GPU Ocelot](http://gpuocelot.gatech.edu/) - -is an open-source dynamic JIT compilation framework. -It allows to execute native *CUDA* binaries by dynamically translating the *NVIDIA PTX* virtual instruction set architecture to other instruction sets. -It supports *NVIDIA* and *AMD* GPUs as well as multicore CPUs via a PTX to LLVM (Low Level Virtual Machine) translator. -The project is not in active development anymore. -It only supports PTX up to version 3.1 (current version is 5.0). - - -### [OpenMP](http://openmp.org//) -is an open specification for vendor agnostic shared memory parallelization. -By adding annotations (pragmas in C/C++) to loops or regions, it allows to easily parallelize existing sequential C/C++/Fortran code in an incremental manner. -Due to the nature of pragmas, these hints are ignored if the compiler does not support them or thinks they are inappropriate. -This allows those programs to be compiled as sequential or parallel versions by only changing a compiler flag. -In C/C++ the syntax for *OpenMP* directives is `#pragma omp` followed by multiple clauses. -For example, with the directive `#pragma omp parallel for`, the compiler will automatically distribute the iterations of the directly following loop across the available cores. -*OpenMP* 4.0 introduced support for offloading computations to accelerator devices, substantially improved the task support and extended the SIMD capabilities. 
-By embedding code within a `#pragma omp target` block, the contained code will be executed on the selected device. -*OpenMP* 4.0 is missing the ability for unstructured data movement and only implements structured data movement from and to devices. -The compiler directive `#pragma omp target data map(...) ...` at the begin of a code block will define which data is copied to, copied back from and is created on the device. -At the end of the code block the memory is copied back or gets deleted. -There is no way to allocate device memory that is persistent between kernel calls in different methods because it is not possible to create a device data region spanning both functions in the general case. -*OpenMP* 4.1, expected for the end of 2015, is likely to introduce `#pragma omp target enter data`, `#pragma omp target exit data` and other unstructured data movement directives that allow to pass and obtain pointers of already resident memory to and from offloaded kernels. -Currently *OpenMP* does not provide a way to control the hierarchical memory because its main assumption is a shared memory for all threads. -Therefore, the block shared memory on *CUDA* devices can not be explicitly utilized. - - -### [OpenACC](http://www.openacc-standard.org/) -is a pragma based programming standard for heterogeneous computing. -It is very similar to *OpenMP* and provides annotations for parallel execution and data movement as well as run-time functions for accelerator and device management. -In contrast to *OpenMP* it allows limited access to *CUDA* block shared memory. -Current compiler implementations support *NVIDA*, *AMD* and *Intel* accelerators. -Only as of *OpenACC* 2.0 explicit memory management and tiling is supported. -*OpenACC* does not support dynamic allocation of memory (`new`, `delete`) in kernel code. -It is aimed to be fully merged with *OpenMP* at some point, but for now *OpenMP* 4.0 only introduced some parts of it. 
- - -### [OpenCL](https://www.khronos.org/opencl/) -is a programming framework for heterogeneous platforms. -It is fully hardware independent and can utilize CPUs and GPUs of nearly all vendors. -This is achieved by compiling the *OpenCL* kernel code (or the standardized *SPIR* intermediate representation) at run-time by the platform driver into the native instruction set. -Versions prior to 2.1 (released in March 2015) did only support a C-like kernel language. -Version 2.1 introduced a subset of C++14. -*OpenCL* does not support single-source programming (combining C++ host code and accelerator code in a single file). -This is a precondition for templated kernels which are required for policy based generic programming. -It is necessary to note that *NVIDIA* seems to neglect their *OpenCL* implementation. -Support for version 1.2 has just been added in April 2015 after only three and a half years after the publication of the standard. -*OpenCL* does not support dynamic allocation of memory (`new`, `delete`) in kernel code. - - -### [SYCL](https://www.khronos.org/sycl/) -is a cross-platform abstraction layer based on *OpenCL*. -The main advantage over *OpenCL* itself is that it allows to write single-source heterogeneous programs. -It enables the usage of a single C++ template function for host and device code. -As of now there is no usable free compiler implementation available that has good support for multiple accelerator devices. - - -### [C++ AMP (Accelerated Massive Parallelism)](https://msdn.microsoft.com/en-us/library/hh265136.aspx) -is an open specification from *Microsoft* currently implemented on top of *DirectX 11*. -It is a language extension requiring compiler support that allows to annotate C++ code that can then be run on multiple accelerators. -*C++ AMP* requires the usage of the `array` data structure or the `array_view` wrapper responsible for copying data to and from the accelerator devices. 
-The `parallel_for_each` function is responsible for offloading the provided function object whose `operator()` has to be annotated with `restrict(amp)`. -The threads can access shared memory and synchronize. -The range of supported accelerator devices, platforms and compilers is currently very limited. - - -### [KOKKOS](https://github.com/kokkos) - -provides an abstract interface for portable, performant shared memory-programming. -It is a C++ library that offers `parallel_for`, `parallel_reduce` and similar functions for describing the pattern of the parallel tasks. -The execution policy determines how the threads are executed. -For example, this influences the sizes of blocks of threads or if static or dynamic scheduling should be used. -The library abstracts the kernel as a function object that can not have any user defined parameters for its `operator()`. -Inconveniently, arguments have to be stored in members of the function object coupling algorithm and data together. -*KOKKOS* provides both, abstractions for parallel execution of code and data management. -Multidimensional arrays with a neutral indexing and an architecture dependent layout are available, which can be used, for example, to abstract the underlying hardware's preferred memory access scheme that could be row-major, column-major or even blocked. - - -### [Thrust](https://thrust.github.io/) -is a parallel algorithms library resembling the C++ Standard Template Library (STL). -It allows to select either the *CUDA*, *TBB* or *OpenMP* back-end at make-time. -Because it is based on generic `host_vector` and `device_vector` container objects, it is tightly coupling the data structure and the parallelization strategy. 
-There exist many similar libraries such as [ArrayFire](http://www.arrayfire.com/) (*CUDA*, *OpenCL*, native C++), [VexCL](https://github.com/ddemidov/vexcl/) (*OpenCL*, *CUDA*), [ViennaCL](http://viennacl.sourceforge.net/) (*OpenCL*, *CUDA*, *OpenMP*) and [hemi](https://github.com/harrism/hemi/) (*CUDA*, native C++). - - - -Distinction of the *alpaka* Library ------------------------------------------- - -In the section about the problems we saw that all portability problems of current HPC codes could be solved with an abstract interface unifying the underlying accelerator back-ends. -The previous section showed that there is currently no project available that could solve all of the problems highlighted. -The C++ interface library proposed to solve all those problems is called *alpaka*. -The subsequent enumeration will summarize the purpose of the library: - -### *alpaka* is ... -* an **abstract interface** describing parallel execution on multiple hierarchy levels. It allows to implement a mapping to various hardware architectures but **is no optimal mapping itself**. - -* sustainably solving portability (50% on the way to reach full performance portability) - -* solving the **heterogeneity** problem. An identical algorithm / kernel can be executed on heterogeneous parallel systems by selecting the target device. - -* reducing the **maintainability** burden by not requiring to duplicate all the parts of the simulation that are directly facing the parallelization framework. Instead, it allows to provide a single version of the algorithm / kernel that can be used by all back-ends. All the accelerator dependent implementation details are hidden within the *alpaka* library. - -* simplifying the **testability** by enabling **easy back-end switching**. No special hardware is required for testing the kernels. Even if the simulation itself will always use the *CUDA* back-end, the tests can completely run on a CPU. 
As long as the *alpaka* library is thoroughly tested for compatibility between the acceleration back-ends, the user simulation code is guaranteed to generate identical results (ignoring rounding errors / non-determinism) and is portable without any changes. - -* **optimizable**. Everything in *alpaka* can be replaced by user code to optimize for special use-cases. - -* **extensible**. Every concept described by the *alpaka* abstraction can be implemented by users. Therefore it is possible to non-intrusively define new devices, queues, buffer types or even whole accelerator back-ends. - -* **data structure agnostic**. The user can use and define arbitrary data structures. - -### *alpaka* is not ... - -* an automatically **optimal mapping** of algorithms / kernels to various acceleration platforms. Except in trivial examples an optimal execution always depends on suitable selected data structure. An adaptive selection of data structures is a separate topic that has to be implemented in a distinct library. - -* automatically **optimizing concurrent data accesses**. - -* **handling** or hiding differences in arithmetic operations. For example, due to **different rounding** or different implementations of floating point operations, results can differ slightly between accelerators. - -* **guaranteeing any determinism** of results. Due to the freedom of the library to reorder or repartition the threads within the tasks it is not possible or even desired to preserve deterministic results. For example, the non-associativity of floating point operations give non-deterministic results within and across accelerators. - -The *alpaka* library is aimed at parallelization within nodes of a cluster. -It does not compete with libraries for distribution of processes across nodes and communication among those. -For these purposes libraries like MPI (Message Passing Interface) or others should be used. 
-MPI is situated one layer higher and can be combined with *alpaka* to utilize the hardware of a whole heterogeneous cluster. -The *alpaka* library can be used for parallelization within nodes, MPI for parallelization across nodes. - - -Comparison ---------- - -The following table summarizes which of the problems mentioned in the section about the problems can be solved by current intra-node parallelization frameworks and the proof-of-concept *alpaka* abstraction library. - -| Framework / API | Open-Source | Free | Single-Source C++ | Portability | Heterogeneity | Maintainability | Testability | Optimizability | Data structure agnostic | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| CUDA | :x: | :white_check_mark: | :white_check_mark: | :x: | :x: | :x: | :x: | :white_check_mark: | :white_check_mark: | -| PGI CUDA-x86 | :x: | :x: | :white_check_mark: | :white_check_mark: | :large_orange_diamond: | :white_check_mark: | :white_check_mark: | :x: | :white_check_mark: | -| GPU Ocelot | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :large_orange_diamond: | :white_check_mark: | :white_check_mark: | :x: | :white_check_mark: | -| OpenMP | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x: | :white_check_mark: | -| OpenACC | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x: | :white_check_mark: | -| OpenCL | :white_check_mark: | :white_check_mark: | :x: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x: | :white_check_mark: | -| SYCL | :white_check_mark: | (:ballot_box_with_check:) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | (:ballot_box_with_check:) | :white_check_mark: | -| C++AMP | :white_check_mark: | 
:white_check_mark: | :white_check_mark: | (:ballot_box_with_check:) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x: | :white_check_mark: | -| KOKKOS | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x: | :large_orange_diamond: | -| Thrust | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :large_orange_diamond: | :white_check_mark: | :white_check_mark: | :x: | :x: | -| **alpaka** | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | - -Properties of intra-node parallelization frameworks and their ability to solve the problems in porting performant HPC codes. :white_check_mark: : yes / fully solved, :large_orange_diamond: : partially solved, :x: : no / not solved diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Block.md b/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Block.md deleted file mode 100644 index db4db63188..0000000000 --- a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Block.md +++ /dev/null @@ -1,34 +0,0 @@ -[:arrow_up: Up](../Abstraction.md) - -Block -===== - -Building a processor with possibly thousands of cores where all cores have an equal length connection for fast communication and synchronization is not viable. -Either the processor size would have to grow exponentially with the number of cores or the all-to-all communication speed would decrease so much that computations on the processor would be impractical. -Therefore, the communication and synchronization of threads has to be limited to sizes manageable by real hardware. - -Figure \ref{fig:block} depicts the solution of introducing a new hierarchy level in the abstraction. 
-A hypothetical processor is allowed to provide synchronization and fast communication within blocks of threads but is not required to provide synchronization across blocks. -The whole grid is subdivided into equal sized blocks with a fast but small shared memory. -Current accelerator abstractions (*CUDA* and *OpenCL*) only support equal sized blocks. -This restriction could possibly be lifted to support future accelerators with heterogeneous block sizes. -![block](block/block.png) - -There is another reason why independent blocks are necessary. -Threads that can communicate and synchronize require either a one-to-one mapping of threads to cores, which is impossible because the number of data elements is theoretically unlimited, or at least a space to store the state of each thread. -Even old single core CPUs were able to execute many communicating and synchronizing threads by using cooperative or preemptive multitasking. -Therefore, one might think that a single core would be enough to execute all the data parallel threads. -But the problem is that even storing the set of registers and local data of all the possible millions of threads of a task grid is not always viable. -The blocking scheme solves this by enabling fast interaction of threads on a local scale but additionally removes the necessity to store the state of all threads in the grid at once because only threads within a block must be executed in parallel. -Within a block of cores there still has to be enough memory to store all registers of all contained threads. -The independence of blocks allows applications to scale well across diverse devices. -As can be seen in the following figure, the accelerator can assign blocks of the task grid to blocks of cores in arbitrary order depending on availability and workload. -![block_scale](block/block_scale.png) - -Shared Memory -------------- - -Each block has its own shared memory. 
-This memory can only be accessed explicitly by threads within the same block and gets discarded after the complete block finished its calculation. -This memory is typically very fast but also very small. -No variables are shared between kernels by default. diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Element.md b/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Element.md deleted file mode 100644 index d89c0b7e19..0000000000 --- a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Element.md +++ /dev/null @@ -1,42 +0,0 @@ -[:arrow_up: Up](../Abstraction.md) - -Element -======= - -To use the maximum available computing power of, for example, a modern x86 processor, the computation has to utilize the SIMD vector registers. -Many current architectures support issuing a single instruction that can be applied to multiple data elements in parallel. - -The original x86 instruction set architecture did not support SIMD instructions but has been enhanced with MMX (64 bit width registers), SSE (128 bit width registers), AVX (256 bit width registers) and AVX-512 (512 bit width registers) extensions. -In varying degree, they allow to process multiple 32 bit and 64 bit floating point numbers as well as 8, 16, 32 and 64 bit signed and unsigned integers. - -*CUDA* capable GPUs do not have vector registers where multiple values of type `float` or `double` can be manipulated by one instruction. -Nevertheless, newer *CUDA* capable devices implement basic SIMD instructions on pairs of 16 bit values and quads of 8-bit values. -They are described in the documentation of the [PTX instruction set architecture](http://docs.nvidia.com/cuda/parallel-thread-execution/index.html#axzz4OTzGGwcJ) chapter 8.7.13 but are only of any use in very special problem domains, for example for deep learning. 
- -It would be optimal if the compiler could automatically vectorize our kernels when they are called in a loop and vectorization is supported by the underlying accelerator. -However, besides full blown vector processors, mainstream CPUs do not support predicated execution or similar complex things within vector registers. -At most, there is support for masking operations which allow to emulate at least some conditional branching. -Therefore, this missing hardware capability has to be circumvented by the compiler. -There are scientific research projects such as the work done by Ralf Karrenberg et al [1](http://www.cdl.uni-saarland.de/publications/theses/karrenberg_msc.pdf) [2](http://www.cdl.uni-saarland.de/projects/wfv/wfv_cgo11_slides.pdf) [3](http://www.cdl.uni-saarland.de/papers/karrenberg_opencl.pdf) building on the *LLVM* compiler infrastructure supporting such whole-function vectorization. -However, current mainstream compilers do not support automatic vectorization of basic, non trivial loops containing control flow statements (`if`, `else`, `for`, etc.) or other non-trivial memory operations. -Therefore, it has to be made easier for the compiler to recognize the vectorization possibilities by making it more explicit. - -The opposite of automatic whole function vectorization is the fully explicit vectorization of expressions via compiler intrinsics directly resulting in the desired assembly instruction. -A big problem when trying to utilize fully explicit vectorization is, that there is no common foundation supported by all explicit vectorization methods. -A wrapper unifying the x86 SIMD intrinsics found in the `intrin.h` or `x86intrin.h` headers with those supported on other platforms, for example ARM NEON (`arm_neon.h`), PowerPC Altivec (`altivec.h`) or *CUDA* is not available and to write one is a huge task in itself. -However, if this would become available in the future, it could easily be integrated into *alpaka* kernels. 
- -Due to current compilers being unable to vectorize whole functions and the explicit vectorization intrinsics not being portable, one has to rely on the vectorization capabilities of current compilers for primitive loops only consisting of a few computations. -By creating a grid of data elements, where multiple elements are processed per thread and threads are pooled in independent blocks, as it is shown in the figure below, the user is free to loop sequentially over the elements or to use vectorization for selected expressions within the kernel. -Even the sequential processing of multiple elements per thread can be useful depending on the architecture. -For example, the *NVIDIA cuBLAS* general matrix-matrix multiplication (GEMM) internally executes only one thread for each second matrix data element to better utilize the registers available per thread. -![element](element/element.png) - - \ No newline at end of file diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Thread.md b/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Thread.md deleted file mode 100644 index a684001f10..0000000000 --- a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Thread.md +++ /dev/null @@ -1,38 +0,0 @@ -[:arrow_up: Up](../Abstraction.md) - -Thread -====== - -Theoretically, a basic data parallel task can be executed optimally by executing one thread per independent data element. -In this context, the term thread does not correspond to a native kernel-thread, an *OpenMP* thread, a *CUDA* thread, a user-level thread or any other such threading variant. -It only represents the execution of a sequence of commands forming the desired algorithm on a per data element level. -This ideal one-to-one mapping of data elements to threads leads to the execution of a multidimensional grid of threads corresponding to the data structure of the underlying problem. -The uniform function executed by each of the threads is called a kernel. 
-Some algorithms such as reductions require the possibility to synchronize or communicate between threads to calculate a correct result in a time optimal manner. -Therefore our basic abstraction requires an n-dimensional grid of synchronizable threads each executing the same kernel. -The following figure shows a hypothetical processing unit that could optimally execute this data parallel task. -The threads are mapped one-to-one to the cores of the processor. -For a time optimal execution, the cores have to have an all-to-all equal length connection for communication and synchronization. -![thread](thread/thread.png) - -The only difference between the threads is their positional index into the grid which allows each thread to compute a different part of the solution. -Threads can always access their private registers and the global memory. - -Registers --------- - -All variables with default scope within a kernel are automatically saved in registers and are not shared automatically. -This memory is local to each thread and can not be accessed by other threads. - -Global Memory ------------- - -The global memory can be accessed from every thread in the grid as well as from the host thread. -This is typically the largest but also the slowest memory available. - -Individual threads within the grid are allowed to statically or dynamically allocate buffers in the global memory. - -Prior to the execution of a task, the host thread copies the input buffers and allocates the output buffers onto the accelerator device. -Pointers to these buffers then can be given as arguments to the task invocation. -By using the index of each thread within the grid, the offset into the global input and output buffers can be calculated. -After the computation has finished, the output buffer can be used either as input to a subsequent task or can be copied back to the host. 
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Warp.md b/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Warp.md deleted file mode 100644 index c5b05df768..0000000000 --- a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Warp.md +++ /dev/null @@ -1,29 +0,0 @@ -[:arrow_up: Up](../Abstraction.md) - -Warp -==== - -With the current abstraction only independent parallelism via blocks and synchronizable parallelism via threads can be expressed. -However, there are more variants of parallelism in real hardware. -Because all threads in the grid are executing the same kernel and even the same instruction at the same time when ignoring divergent control flows, a lot of chip space can be saved. -Multiple threads can be executed in perfect synchronicity, which is also called lock-step. -A group of such threads executing the same instruction at the same time is called a warp. -All threads within a warp share a single instruction pointer (IP), and all cores executing the threads share one instruction fetch (IF) and instruction decode (ID) unit. -![warp](warp/warp.png) - -Even threads with divergent control flows can be executed within one warp. -*CUDA*, for example, solves this by supporting predicated execution and warp voting. -For long conditional branches the compiler inserts code which checks if all threads in the warp take the same branch. -For small branches, where this is too expensive, all threads always execute all branches. -Control flow statements result in a predicate and only in those threads where it is true, the predicated instructions will have an effect. - -Not only *CUDA* GPUs support the execution of multiple threads in a warp. -Full blown vector processors with good compilers are capable of combining multiple loop iterations containing complex control flow statements in a similar manner as *CUDA*. - -Due to the synchronicity of threads within a warp, memory operations will always occur at the same time in all threads. 
-This allows to coalesce memory accesses. -Different *CUDA* devices support different levels of memory coalescing. -Older ones only supported combining multiple memory accesses if they were aligned and sequential in the order of thread indices. -Newer ones support unaligned scattered accesses as long as they target the same 128 byte segment. - -The ability of very fast context switches between warps and a queue of ready warps allows *CUDA* capable GPUs to hide the latency of global memory operations. diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/Library.md b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/Library.md deleted file mode 100644 index 3a0c042164..0000000000 --- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/Library.md +++ /dev/null @@ -1,16 +0,0 @@ -[:arrow_up: Up](../Implementation.md) - -Library Interface -================= - -As described in the chapter about the Abstraction, the general design of the library is very similar to *CUDA* and *OpenCL* but extends both by some points, while not requiring any language extensions. -General interface design as well as interface implementation decisions differentiating *alpaka* from those libraries are described in the Rationale section. -It uses C++ because it is one of the most performant languages available on nearly all systems. -Furthermore, C++11 allows to describe the concepts in a very abstract way that is not possible with many other languages. -The *alpaka* library extensively makes use of advanced functional C++ template meta-programming techniques. -The Implementation Details section discusses the C++ library and the way it provides extensibility and optimizability. - -1. [Structure](library/Structure.md) -2. [Usage](library/Usage.md) -2. [Rationale](library/Rationale.md) -3. 
[Details](library/Details.md) diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/Mapping.md b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/Mapping.md deleted file mode 100644 index 70a28fb6b5..0000000000 --- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/Mapping.md +++ /dev/null @@ -1,24 +0,0 @@ -[:arrow_up: Up](../Implementation.md) - -Mapping onto Specific Hardware Architectures -============================================ - -By providing an accelerator independent interface for kernels, their execution and memory accesses at different hierarchy levels, *alpaka* allows the user to write accelerator independent code that does not neglect performance. - -The mapping of the decomposition to the execution environment is handled by the back-ends provided by the *alpaka* library as well as user defined back-ends. -A computation that is described with a maximum of the parallelism available in the *redundant hierarchical parallelism* abstraction can not be mapped one to one to any existing hardware. -GPUs do not have vector registers for `float` or `double` types. -Therefore, the element level is often omitted on *CUDA* accelerators. -CPUs in turn are not (currently) capable of running thousands of threads concurrently and do not have equivalently fast inter-thread synchronization and shared memory access as GPUs do. - -A major point of the *redundant hierarchical parallelism* abstraction is to ignore specific unsupported levels and utilize only the ones supported on a specific accelerator. -This allows a mapping to various current and future accelerators in a variety of ways enabling optimal usage of the underlying compute and memory capabilities. - -The grid level is always mapped to the whole device being in consideration. -The scheduler can always execute multiple kernel grids from multiple queues in parallel by statically or dynamically subdividing the available resources. 
-However, this will only ever simplify the mapping due to less available processing units. -Furthermore, being restricted to less resources automatically improves the locality of data due to spatial and temporal locality properties of the caching hierarchy. - -1. [CUDA GPUs](mapping/CUDA.md) -2. [x86 CPUs](mapping/x86.md) -2. [Accelerators](mapping/Accelerators.md) diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Structure.md b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Structure.md deleted file mode 100644 index e0a2c88b2d..0000000000 --- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Structure.md +++ /dev/null @@ -1,35 +0,0 @@ -[:arrow_up: Up](../Library.md) - -Structure -========= - -The *alpaka* library allows offloading of computations from the host execution domain to the accelerator execution domain, whereby they are allowed to be identical. - -In the abstraction hierarchy the library code is interleaved with user supplied code as is depicted in the following figure. -![Execution Domains](execution_domain.png) -User code invokes library functions, which in turn execute the user provided thread function (kernel) in parallel on the accelerator. -The kernel in turn calls library functions when accessing accelerator properties and methods. -Additionally, the user can enhance or optimize the library implementations by extending or replacing specific parts. - -The *alpaka* abstraction itself only defines requirements a type has to fulfill to be usable with the template functions the library provides. -These type constraints are called concepts in C++. - -*A concept is a set of requirements consisting of valid expressions, associated types, invariants, and complexity guarantees. -A type that satisfies the requirements is said to model the concept. 
-A concept can extend the requirements of another concept, which is called refinement.* [BoostConcepts](http://www.boost.org/community/generic_programming.html) - -Concepts allow to safely define polymorphic algorithms that work with objects of many different types. - -The *alpaka* library implements a stack of concepts and their interactions modeling the abstraction defined in the previous chapter. -Furthermore, default implementations for various devices and accelerators modeling those are included in the library. -The interaction of the main user facing concepts can be seen in the following figure. -![user / alpaka code interaction](structure_assoc.png) - -For each type of `Device` there is a `Platform` for enumerating the available `Device`s. -A `Device` is the requirement for creating `Queues` and `Events` as it is for allocating `Buffers` on the respective `Device`. `Buffers` can be copied, their memory be set and they can be pinned or mapped. -Copying and setting a buffer requires the corresponding `Copy` and `Set` tasks to be enqueued into the `Queue`. -An `Event` can be enqueued into a `Queue` and its completion state can be queried by the user. -It is possible to wait for (synchronize with) a single `Event`, a `Queue` or a whole `Device`. -An `Executor` can be enqueued into a `Queue` and will execute the `Kernel` (after all previous tasks in the queue have been completed). -The `Kernel` in turn has access to the `Accelerator` it is running on. -The `Accelerator` provides the `Kernel` with its current index in the block or grid, their extents or other data as well as it allows to allocate shared memory, execute atomic operations and many more. 
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Usage.md b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Usage.md deleted file mode 100644 index 52382c47e5..0000000000 --- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Usage.md +++ /dev/null @@ -1,112 +0,0 @@ -[:arrow_up: Up](../Library.md) - -Interface Usage -=============== - -Accelerator Executable Functions --------------------------------- - -Functions that should be executable on an accelerator have to be annotated with the execution domain (one of `ALPAKA_FN_HOST`, `ALPAKA_FN_ACC` and `ALPAKA_FN_HOST_ACC`). -They most probably also require access to the accelerator data and methods, such as indices and extents as well as functions to allocate shared memory and to synchronize all threads within a block. -Therefore the accelerator has to be passed in as a templated constant reference parameter as can be seen in the following code snippet. - -```C++ -template< - typename TAcc> -ALPAKA_FN_ACC auto doSomethingOnAccelerator( - TAcc const & acc/*, - ...*/) // Arbitrary number of parameters --> int // Arbitrary return type -{ - //... -} -``` - - -Kernel Definition ------------------ - -A kernel is a special function object which has to conform to the following requirements: -* it has to fulfill the `std::is_trivially_copyable` trait (has to be copyable via memcpy) -* the `operator()` is the kernel entry point - * it has to be an accelerator executable function - * it has to return `void`. - * its first argument has to be the accelerator (templated for arbitrary accelerator backends). - -The following code snippet shows a basic example of a kernel function object. - -```C++ -struct MyKernel -{ - template< - typename TAcc> // Templated on the accelerator type. - ALPAKA_FN_ACC // Macro marking the function to be executable on all accelerators. - auto operator()( // The function / kernel to execute. 
- TAcc const & acc/*, // The specific accelerator implementation. - ...*/) const // Must be 'const'. - -> void - { - //... - } - // Class can have members but has to be std::is_trivially_copyable. - // Classes must not have pointers or references to host memory! -}; -``` - -The kernel function object is shared across all threads in all blocks. -Due to the block execution order being undefined, there is no safe and consistent way of altering state that is stored inside of the function object. -Therefore, the `operator()` of the kernel function object has to be `const` and is not allowed to modify any of the object members. - - -Index and Work Division ------------------------ - -The `alpaka::workdiv::getWorkDiv` and the `alpaka::idx::getIdx` functions both return a vector of the dimensionality the accelerator has been defined with. -They are parametrized by the origin of the calculation as well as the unit in which the values are calculated. -For example, `alpaka::workdiv::getWorkDiv(acc)` returns a vector with the extents of the grid in units of threads. - - -Memory Management ------------------ - -The memory allocation function of the *alpaka* library (`alpaka::mem::buf::alloc(device, extents)`) is uniform for all devices, even for the host device. -It does not return raw pointers but reference counted memory buffer objects that remove the necessity for manual freeing and the possibility of memory leaks. -Additionally the memory buffer objects know their extents, their pitches as well as the device they reside on. -This allows buffers that possibly reside on different devices with different pitches to be copied only by providing the buffer objects as well as the extents of the region to copy (`alpaka::mem::view::copy(bufDevA, bufDevB, copyExtents`). - -Kernel Execution ----------------- - -The following source code listing shows the execution of a kernel by enqueuing the execution task into a queue. - -```C++ -// Define the dimensionality of the task. 
-using Dim = alpaka::dim::DimInt<1u>; -// Define the type of the indexes. -using Idx = std::size_t; -// Define the accelerator to use. -using Acc = alpaka::acc::AccCpuSerial; -// Select the queue type. -using Queue = alpaka::queue::QueueCpuNonBlocking; - -// Select a device to execute on. -auto devAcc(alpaka::pltf::getDevByIdx(0)); -// Create a queue to enqueue the execution into. -Queue queue(devAcc); - -// Create a 1-dimensional work division with 256 blocks of 16 threads each. -auto const workDiv(alpaka::workdiv::WorkDivMembers(256u, 16u)); -// Create an instance of the kernel function object. -MyKernel kernel; -// Enqueue the execution task into the queue. -alpaka::kernel::exec(queue, workDiv, kernel/*, arguments ...*/); -``` - -The dimensionality of the task as well as the type for index and extent have to be defined explicitly. -Following this, the type of accelerator to execute on, as well as the type of the queue have to be defined. -For both of these types instances have to be created. -For the accelerator this has to be done indirectly by enumerating the required device via the device manager, whereas the queue can be created directly. - -To execute the kernel, an instance of the kernel function object has to be constructed. -Following this, an execution task combining the work division (grid and block sizes) with the kernel function object and the bound invocation arguments has to be created. -After that this task can be enqueued into a queue for immediate or later execution (depending on the queue used). 
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/Accelerators.md b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/Accelerators.md deleted file mode 100644 index 1cf9440c94..0000000000 --- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/Accelerators.md +++ /dev/null @@ -1,76 +0,0 @@ -[:arrow_up: Up](../Mapping.md) - -Accelerator Implementations -=========================== - -|alpaka|Serial|std::thread|Boost.Fiber|OpenMP 2.0|OpenMP 4.0|CUDA 9.0+| -|---|---|---|---|---|---|---| -|Devices|Host Core|Host Cores|Host Core|Host Cores|Host Cores|NVIDIA GPUs| -|Lib/API|n/a|std::thread|boost::fibers::fiber|OpenMP 2.0|OpenMP 4.0|CUDA 9.0+| -|Kernel execution|n/a|std::thread(kernel)|boost::fibers::fiber(kernel)|omp_set_dynamic(0), #pragma omp parallel num_threads(iNumKernelsInBlock)|#pragma omp target, #pragma omp teams num_teams(...) thread_limit(...), #pragma omp distribute, #pragma omp parallel num_threads(...)|cudaConfigureCall, cudaSetupArgument, cudaLaunch| -|Execution strategy grid-blocks|sequential|sequential|sequential|sequential|undefined|undefined| -|Execution strategy block-kernels|sequential|preemptive multitasking|cooperative multithreading|preemptive multitasking|preemptive multitasking|lock-step within warps| -|getIdx|n/a|*block-kernel*: mapping of std::this_thread::get_id() *grid-block*: member variable|*block-kernel*: mapping of std::this_fiber::get_id() *grid-block*: member variable|*block-kernel*: omp_get_thread_num() to 3D index mapping *grid-block*: member variable|*block-kernel*: omp_get_thread_num() to 3D index mapping *grid-block*: member variable|threadIdx, blockIdx| -|getExtent|member variables|member variables|member variables|member variables|member variables|gridDim, blockDim| -|getBlockSharedExternMem|allocated in memory prior to kernel execution|allocated in memory prior to kernel execution|allocated in memory prior to kernel execution|allocated in memory prior to kernel 
execution|allocated in memory prior to kernel execution|\__shared__| -|allocBlockSharedMem|master thread allocates|syncBlockKernels -> master thread allocates -> syncBlockKernels|syncBlockKernels -> master thread allocates -> syncBlockKernels|syncBlockKernels -> master thread allocates -> syncBlockKernels|syncBlockKernels -> master thread allocates -> syncBlockKernels|\__shared__| -|syncBlockKernels|n/a|barrier|barrier|#pragma omp barrier|#pragma omp barrier|__syncthreads| -|atomicOp|n/a|std::lock_guard< std::mutex >|n/a|#pragma omp critical|#pragma omp critical|atomicXXX| -|ALPAKA_FN_HOST_ACC, ALPAKA_FN_ACC, ALPAKA_FN_HOST|inline|inline|inline|inline|inline|\__device__, \__host__, \__forceinline__| - - -### Serial - -The serial accelerator only allows blocks with exactly one thread. -Therefore it does not implement real synchronization or atomic primitives. - -### Threads - -#### Execution - -To prevent recreation of the threads between execution of different blocks in the grid, the threads are stored inside a thread pool. -This thread pool is local to the invocation because making it local to the KernelExecutor could mean a heavy memory usage and lots of idling kernel-threads when there are multiple KernelExecutors around. -Because the default policy of the threads in the pool is to yield instead of waiting, this would also slow down the system immensely. - -### Fibers - -#### Execution - -To prevent recreation of the fibers between execution of different blocks in the grid, the fibers are stored inside a fibers pool. -This fiber pool is local to the invocation because making it local to the KernelExecutor could mean a heavy memory usage when there are multiple KernelExecutors around. - -### OpenMP - -#### Execution - -Parallel execution of the kernels in a block is required because when syncBlockThreads is called all of them have to be done with their work up to this line. -So we have to spawn one real thread per kernel in a block. 
-`omp for` is not useful because it is meant for cases where multiple iterations are executed by one thread but in our case a 1:1 mapping is required. -Therefore we use `omp parallel` with the specified number of threads in a block. -Another reason for not using `omp for` like `#pragma omp parallel for collapse(3) num_threads(blockDim.x*blockDim.y*blockDim.z)` is that `#pragma omp barrier` used for intra block synchronization is not allowed inside `omp for` blocks. - -Because OpenMP is designed for a 1:1 abstraction of hardware to software threads, the block size is restricted by the number of OpenMP threads allowed by the runtime. -This could be as little as 2 or 4 kernels but on a system with 4 cores and hyper-threading OpenMP can also allow 64 threads. - -#### Index - -OpenMP only provides a linear thread index. This index is converted to a 3 dimensional index at runtime. - -#### Atomic - -We can not use '#pragma omp atomic' because braces or calling other functions directly after `#pragma omp atomic` are not allowed. -Because we are implementing the CUDA atomic operations which return the old value, this requires `#pragma omp critical` to be used. -`omp_set_lock` is an alternative but is usually slower. - -### CUDA - -Nearly all CUDA functionality can be directly mapped to alpaka function calls. -A major difference is that CUDA requires the block and grid sizes to be given in (x, y, z) order. -Alpaka uses the mathematical C/C++ array indexing scheme [z][y][x]. -Dimension 0 in this case is z, dimensions 2 is x. - -Furthermore alpaka does not require the indices and extents to be 3-dimensional. -The accelerators are templatized on and support arbitrary dimensionality. -NOTE: Currently the CUDA implementation is restricted to a maximum of 3 dimensions! - -NOTE: The CUDA-accelerator back-end can change the current CUDA device and will NOT set the device back to the one prior to the invocation of the alpaka function! 
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/CUDA.md b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/CUDA.md deleted file mode 100644 index b61583b209..0000000000 --- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/CUDA.md +++ /dev/null @@ -1,243 +0,0 @@ -[:arrow_up: Up](../Mapping.md) - -CUDA GPUs -========= - -Mapping the abstraction to GPUs supporting *CUDA* is straightforward because the hierarchy levels are identical up to the element level. -So blocks of warps of threads will be mapped directly to their *CUDA* equivalent. - -The element level is supported through an additional run-time variable containing the extent of elements per thread. -This variable can be accessed by all threads and should optimally be placed in constant device memory for fast access. - -Porting CUDA to *alpaka* ------------------------- - -Nearly all CUDA functionality can be directly mapped to alpaka function calls. -A major difference is that CUDA requires the block and grid sizes to be given in (x, y, z) order. Alpaka uses the mathematical C/C++ array indexing scheme [z][y][x]. In both cases x is the innermost / fast running index. - -Furthermore alpaka does not require the indices and extents to be 3-dimensional. -The accelerators are templatized on and support arbitrary dimensionality. -NOTE: Currently the CUDA implementation is restricted to a maximum of 3 dimensions! - -NOTE: You have to be careful when mixing alpaka and non alpaka CUDA code. The CUDA-accelerator back-end can change the current CUDA device and will NOT set the device back to the one prior to the invocation of the alpaka function. 
- - -### Programming Interface - -*Function Attributes* - -|CUDA|alpaka| -|---|---| -|\_\_host\_\_|ALPAKA_FN_HOST| -|\_\_device\_\_|ALPAKA_FN_ACC*| -|\_\_global\_\_|ALPAKA_FN_ACC*| -|\_\_host\_\_ \_\_device\_\_|ALPAKA_FN_HOST_ACC| - -\* You can not call CUDA only methods except when ALPAKA_ACC_GPU_CUDA_ONLY_MODE is enabled. - -*Memory* - -|CUDA|alpaka| -|---|---| -|\_\_shared\_\_|[alpaka::block::shared::st::allocVar(acc)](../../../../../test/unit/block/shared/src/BlockSharedMemSt.cpp#L69)| -|\_\_constant\_\_|[ALPAKA_STATIC_ACC_MEM_CONSTANT](../../../../../test/unit/mem/view/src/ViewStaticAccMem.cpp#L58-L63)| -|\_\_device\_\_|[ALPAKA_STATIC_ACC_MEM_GLOBAL](../../../../../test/unit/mem/view/src/ViewStaticAccMem.cpp#L164-L169)| - -*Index / Work Division* - -|CUDA|alpaka| -|---|---| -|threadIdx|alpaka::idx::getIdx(acc)| -|blockIdx|alpaka::idx::getIdx(acc)| -|blockDim|alpaka::workdiv::getWorkDiv(acc)| -|gridDim|alpaka::workdiv::getWorkDiv(acc)| - -*Types* - -|CUDA|alpaka| -|---|---| -|dim3|[alpaka::vec::Vec< TDim, TVal >](../../../../../test/unit/vec/src/VecTest.cpp#L43-L45)| - - -### CUDA Runtime API - -The following tables list the functions available in the [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/modules.html#modules) and their equivalent alpaka functions: - -*Device Management* - -|CUDA|alpaka| -|---|---| -|cudaChooseDevice|-| -|cudaDeviceGetAttribute|-| -|cudaDeviceGetByPCIBusId|-| -|cudaDeviceGetCacheConfig|-| -|cudaDeviceGetLimit|-| -|cudaDeviceGetP2PAttribute|-| -|cudaDeviceGetPCIBusId|-| -|cudaDeviceGetSharedMemConfig|-| -|cudaDeviceGetQueuePriorityRange|-| -|cudaDeviceReset|alpaka::dev::reset(device)| -|cudaDeviceSetCacheConfig|-| -|cudaDeviceSetLimit|-| -|cudaDeviceSetSharedMemConfig|-| -|cudaDeviceSynchronize|void alpaka::wait::wait(device)| -|cudaGetDevice|n/a (no current device)| -|cudaGetDeviceCount|std::size_t alpaka::pltf::getDevCount< TPltf >()| -|cudaGetDeviceFlags|-| -|cudaGetDeviceProperties|alpaka::acc::getAccDevProps(dev) 
*NOTE: Only some properties available*| -|cudaIpcCloseMemHandle|-| -|cudaIpcGetEventHandle|-| -|cudaIpcGetMemHandle|-| -|cudaIpcOpenEventHandle|-| -|cudaIpcOpenMemHandle|-| -|cudaSetDevice|n/a (no current device)| -|cudaSetDeviceFlags|-| -|cudaSetValidDevices|-| - -*Error Handling* - -|CUDA|alpaka| -|---|---| -|cudaGetErrorName|n/a (handled internally, available in exception message)| -|cudaGetErrorString|n/a (handled internally, available in exception message)| -|cudaGetLastError|n/a (handled internally)| -|cudaPeekAtLastError|n/a (handled internally)| - -*Queue Management* - -|CUDA|alpaka| -|---|---| -|cudaStreamAddCallback|alpaka::queue::enqueue(queue, \[\](){do_something();})| -|cudaStreamAttachMemAsync|-| -|cudaStreamCreate|
  • queue = alpaka::queue::QueueCudaRtNonBlocking(device);
  • queue = alpaka::queue::QueueCudaRtBlocking(device);
| -|cudaStreamCreateWithFlags|see cudaStreamCreate (cudaStreamNonBlocking hard coded)| -|cudaStreamCreateWithPriority|-| -|cudaStreamDestroy|n/a (Destructor)| -|cudaStreamGetFlags|-| -|cudaStreamGetPriority|-| -|cudaStreamQuery|bool alpaka::queue::empty(queue)| -|cudaStreamSynchronize|void alpaka::wait::wait(queue)| -|cudaStreamWaitEvent|void alpaka::wait::wait(queue, event)| - -*Event Management* - -|CUDA|alpaka| -|---|---| -|cudaEventCreate|alpaka::event::Event< TQueue > event(dev);| -|cudaEventCreateWithFlags|-| -|cudaEventDestroy|n/a (Destructor)| -|cudaEventElapsedTime|-| -|cudaEventQuery|bool alpaka::event::test(event)| -|cudaEventRecord|void alpaka::queue::enqueue(queue, event)| -|cudaEventSynchronize|void alpaka::wait::wait(event)| - -*Memory Management* - -|CUDA|alpaka| -|---|---| -|cudaArrayGetInfo|-| -|cudaFree|n/a (automatic memory management with reference counted memory handles)| -|cudaFreeArray|-| -|cudaFreeHost|n/a| -|cudaFreeMipmappedArray|-| -|cudaGetMipmappedArrayLevel|-| -|cudaGetSymbolAddress|-| -|cudaGetSymbolSize|-| -|cudaHostAlloc|n/a, the existing buffer can be pinned using alpaka::mem::buf::prepareForAsyncCopy(memBuf)| -|cudaHostGetDevicePointer|-| -|cudaHostGetFlags|-| -|cudaHostRegister|-| -|cudaHostUnregister|-| -|cudaMalloc|alpaka::mem::buf::alloc(device, extents1D)| -|cudaMalloc3D|alpaka::mem::buf::alloc(device, extents3D)| -|cudaMalloc3DArray|-| -|cudaMallocArray|-| -|cudaMallocHost|alpaka::mem::buf::alloc(device, extents) *1D, 2D, 3D supported!*| -|cudaMallocManaged|-| -|cudaMallocMipmappedArray|-| -|cudaMallocPitch|alpaka::mem::alloc(device, extents2D)| -|cudaMemAdvise|-| -|cudaMemGetInfo|
  • alpaka::dev::getMemBytes
  • alpaka::dev::getFreeMemBytes
    • | -|cudaMemPrefetchAsync|-| -|cudaMemRangeGetAttribute|-| -|cudaMemRangeGetAttributes|-| -|cudaMemcpy|alpaka::mem::view::copy(memBufDst, memBufSrc, extents1D)| -|cudaMemcpy2D|alpaka::mem::view::copy(memBufDst, memBufSrc, extents2D)| -|cudaMemcpy2DArrayToArray|-| -|cudaMemcpy2DAsync|alpaka::mem::view::copy(memBufDst, memBufSrc, extents2D, queue)| -|cudaMemcpy2DFromArray|-| -|cudaMemcpy2DFromArrayAsync|-| -|cudaMemcpy2DToArray|-| -|cudaMemcpy2DToArrayAsync|-| -|cudaMemcpy3D|alpaka::mem::view::copy(memBufDst, memBufSrc, extents3D)| -|cudaMemcpy3DAsync|alpaka::mem::view::copy(memBufDst, memBufSrc, extents3D, queue)| -|cudaMemcpy3DPeer|alpaka::mem::view::copy(memBufDst, memBufSrc, extents3D)| -|cudaMemcpy3DPeerAsync|alpaka::mem::view::copy(memBufDst, memBufSrc, extents3D, queue)| -|cudaMemcpyArrayToArray|-| -|cudaMemcpyAsync|alpaka::mem::view::copy(memBufDst, memBufSrc, extents1D, queue)| -|cudaMemcpyFromArray|-| -|cudaMemcpyFromArrayAsync|-| -|cudaMemcpyFromSymbol|-| -|cudaMemcpyFromSymbolAsync|-| -|cudaMemcpyPeer|alpaka::mem::view::copy(memBufDst, memBufSrc, extents1D)| -|cudaMemcpyPeerAsync|alpaka::mem::view::copy(memBufDst, memBufSrc, extents1D, queue)| -|cudaMemcpyToArray|-| -|cudaMemcpyToArrayAsync|-| -|cudaMemcpyToSymbol|-| -|cudaMemcpyToSymbolAsync|-| -|cudaMemset|alpaka::mem::view::set(memBufDst, byte, extents1D)| -|cudaMemset2D|alpaka::mem::view::set(memBufDst, byte, extents2D)| -|cudaMemset2DAsync|alpaka::mem::view::set(memBufDst, byte, extents2D, queue)| -|cudaMemset3D|alpaka::mem::view::set(memBufDst, byte, extents3D)| -|cudaMemset3DAsync|alpaka::mem::view::set(memBufDst, byte, extents3D, queue)| -|cudaMemsetAsync|alpaka::mem::view::set(memBufDst, byte, extents1D, queue)| -|make_cudaExtent|-| -|make_cudaPitchedPtr|-| -|make_cudaPos|-| -|cudaMemcpyHostToDevice|n/a (direction of copy is determined automatically)| -|cudaMemcpyDeviceToHost|n/a (direction of copy is determined automatically)| - -*Execution Control* - -|CUDA|alpaka| -|---|---| 
-|cudaFuncGetAttributes|-| -|cudaFuncSetCacheConfig|-| -|cudaFuncSetSharedMemConfig|-| -|cudaLaunchKernel|
      • alpaka::kernel::exec< TAcc >(queue, workDiv, kernel, params...)
      • alpaka::kernel::BlockSharedExternMemSizeBytes< TKernel< TAcc > >::getBlockSharedExternMemSizeBytes<...>(...)
      | -|cudaSetDoubleForDevice|n/a (alpaka assumes double support)| -|cudaSetDoubleForHost|n/a (alpaka assumes double support)| - -*Occupancy* - -|CUDA|alpaka| -|---|---| -|cudaOccupancyMaxActiveBlocksPerMultiprocessor|-| -|cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags|-| - - -*Unified Addressing* - -|CUDA|alpaka| -|---|---| -|cudaPointerGetAttributes|-| - -*Peer Device Memory Access* - -|CUDA|alpaka| -|---|---| -|cudaDeviceCanAccessPeer|-| -|cudaDeviceDisablePeerAccess|-| -|cudaDeviceEnablePeerAccess|automatically done when required| - -**OpenGL, Direct3D, VDPAU, EGL, Graphics Interoperability** - -*not available* - -**Texture/Surface Reference/Object Management** - -*not available* - -**Version Management** - -*not available* diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/HIP.md b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/HIP.md deleted file mode 100644 index c30fe36371..0000000000 --- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/HIP.md +++ /dev/null @@ -1,84 +0,0 @@ -## Current restrictions on HSA platform - -- Workaround for unsupported `syncthreads_{count|and|or}` depending of the hardware. - - uses temporary shared value and atomics -- Workaround for buggy `hipStreamQuery`, `hipStreamSynchronize`. - - `hipStreamQuery` and `hipStreamSynchronize` did not work in multithreaded environment -- Workaround for missing `cuStreamWaitValue32`. - - polls value each 10ms -- device constant memory not supported yet -- note, that `printf` in kernels is still not supported in HIP -- exclude `hipMalloc3D` and `hipMallocPitch` when size is zero otherwise they throw an Unknown Error -- `TestAccs` excludes 3D specialization of Hip back-end for now because `verifyBytesSet` fails in `memView` for 3D specialization -- `dim3` structure is not available on device (use `alpaka::vec::Vec` instead) -- a chain of functions must also provide correct host-device signatures - - e.g. 
a host function cannot be called from a host-device function -- AMD device architecture currently hardcoded in `alpakaConfig.cmake` - -## Compiling HIP from source - -Follow [this](https://github.com/ROCm-Developer-Tools/HIP/blob/master/INSTALL.md "HIP installation") guide for installing HIP. -HIP requires either `nvcc` or ROCm with `hcc` to be installed on your system (see guide for further details). - -- If you want the hip binaries to be located in a directory that does not require superuser access, be sure to change the install directory of HIP by modifying the `CMAKE_INSTALL_PREFIX` cmake variable. -- Also, after the installation is complete, add the following line to the `.profile` file in your home directory, in order to add the path to the HIP binaries to PATH: -`PATH=$PATH:` - -```bash -git clone --recursive https://github.com/ROCm-Developer-Tools/HIP.git -cd "HIP" -mkdir -p build -cd build -cmake -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}" -DCMAKE_INSTALL_PREFIX=${YOUR_HIP_INSTALL_DIR} -DBUILD_TESTING=OFF .. -make -make install -``` -Set the appropriate paths (edit `${YOUR_**}` variables). -```bash -# HIP_PATH required by HIP tools -export HIP_PATH=${YOUR_HIP_INSTALL_DIR} -# Paths required by HIP tools -export CUDA_PATH=${YOUR_CUDA_ROOT} -# - if required, path to HSA include, lib. Default /opt/rocm/hsa. -export HSA_PATH=${YOUR_HSA_PATH} -# HIP binaries and libraries -export PATH=${HIP_PATH}/bin:$PATH -export LD_LIBRARY_PATH=${HIP_PATH}/lib64:${LD_LIBRARY_PATH} -``` -Test the HIP binaries. 
-```bash -# calls nvcc or clang -which hipcc -hipcc -V -which hipconfig -hipconfig -v -``` - - -## Verifying HIP installation -- If PATH points to the location of the HIP binaries, the following command should list several relevant environment variables, and also the selected compiler on your system-`hipconfig -f` -- Compile and run the [square sample](https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/0_Intro/square), as pointed out in the [original](https://github.com/ROCm-Developer-Tools/HIP/blob/master/INSTALL.md#verify-your-installation) HIP install guide. - -## Compiling examples with HIP back-end -As of now, the back-end has only been tested on the NVIDIA platform. -### NVIDIA Platform -* One issue in this branch of alpaka is that the host compiler flags don't propagate to the device compiler, as they do in CUDA. This is because a counterpart to the CUDA_PROPAGATE_HOST_FLAGS cmake variable has not been defined in the FindHIP.cmake file. -Alpaka forwards the host compiler flags in cmake to the `HIP_NVCC_FLAGS` cmake variable, which also takes user-given flags. To add flags to this variable, toggle the advanced mode in `ccmake`. - - -## Random Number Generator Library rocRAND for HIP back-end - -rocRAND provides an interface for HIP, where the cuRAND or rocRAND API is called depending on the chosen HIP platform (can be configured with cmake in alpaka). - -Clone the rocRAND repository, then build and install it: -```bash -git clone https://github.com/ROCmSoftwarePlatform/rocRAND -cd rocRAND -mkdir -p build -cd build -cmake -DCMAKE_INSTALL_PREFIX=${HIP_PATH} -DBUILD_BENCHMARK=OFF -DBUILD_TEST=OFF -DCMAKE_MODULE_PATH=${HIP_PATH}/cmake .. -make -``` - -The `CMAKE_MODULE_PATH` is a cmake variable for locating module finding scripts like *FindHIP.cmake*. -The paths to the `rocRAND` library and include directories should be appended to the `CMAKE_PREFIX_PATH` variable. 
diff --git a/thirdParty/cupla/alpaka/doc/doxygen/Doxyfile b/thirdParty/cupla/alpaka/docs/Doxyfile similarity index 99% rename from thirdParty/cupla/alpaka/doc/doxygen/Doxyfile rename to thirdParty/cupla/alpaka/docs/Doxyfile index 83515d2139..0af16f9397 100644 --- a/thirdParty/cupla/alpaka/doc/doxygen/Doxyfile +++ b/thirdParty/cupla/alpaka/docs/Doxyfile @@ -51,14 +51,14 @@ PROJECT_BRIEF = "Abstraction Library for Parallel Kernel Acceleration" # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy # the logo to the output directory. -PROJECT_LOGO = alpaka_doxygen.png +PROJECT_LOGO = logo/alpaka_doxygen.png # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. -OUTPUT_DIRECTORY = +OUTPUT_DIRECTORY = doxygen/ # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and @@ -829,8 +829,8 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = ../../include/ \ - ../../README.md +INPUT = ../include/ \ + ../README.md # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -1022,7 +1022,7 @@ FILTER_SOURCE_PATTERNS = # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. 
-USE_MDFILE_AS_MAINPAGE = ../../README.md +USE_MDFILE_AS_MAINPAGE = ../README.md #--------------------------------------------------------------------------- # Configuration options related to source browsing @@ -2029,7 +2029,7 @@ MAN_LINKS = NO # captures the structure of the code including all documentation. # The default value is: NO. -GENERATE_XML = NO +GENERATE_XML = YES # The XML_OUTPUT tag is used to specify where the XML pages will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of diff --git a/thirdParty/cupla/alpaka/docs/Makefile b/thirdParty/cupla/alpaka/docs/Makefile new file mode 100644 index 0000000000..ad9de6edbf --- /dev/null +++ b/thirdParty/cupla/alpaka/docs/Makefile @@ -0,0 +1,22 @@ +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= --color +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +checklinks: + $(SPHINXBUILD) -b linkcheck $(SPHINXOPTS) "$(SOURCEDIR)" "$(BUILDDIR)" + @echo + @echo "Check finished. Report is in $(BUILDDIR)." diff --git a/thirdParty/cupla/alpaka/docs/cheatsheet/README.md b/thirdParty/cupla/alpaka/docs/cheatsheet/README.md new file mode 100644 index 0000000000..35ed849a85 --- /dev/null +++ b/thirdParty/cupla/alpaka/docs/cheatsheet/README.md @@ -0,0 +1,17 @@ +# About + +The style sheet (cheatsheet.style) was originally developed Roberto Alsina. 
+ +https://github.com/ralsina/rst-cheatsheet + +# Install + +``` bash +pip install rst2pdf +``` + +# Build + +``` bash +rst2pdf -s cheatsheet.style ../source/usage/cheatsheet.rst -o cheatsheet.pdf +``` diff --git a/thirdParty/cupla/alpaka/docs/cheatsheet/cheatsheet.style b/thirdParty/cupla/alpaka/docs/cheatsheet/cheatsheet.style new file mode 100644 index 0000000000..8e8162edc3 --- /dev/null +++ b/thirdParty/cupla/alpaka/docs/cheatsheet/cheatsheet.style @@ -0,0 +1,158 @@ +{ + "pageSetup": {"margin-left": 8, + "margin-right": 8, + "margin-top": 8, + "margin-bottom": 8, + "spacing-header": 0, + "spacing-footer": 10, + "firstTemplate": "twoColumn", + "width": "29.7cm", + "height": "21cm" + }, + "pageTemplates" : { + "threeColumn": { + "frames": [ + ["2%", "0cm", "29.333%", "100%"], + ["35.333%", "0cm", "29.333%", "100%"], + ["68.666%", "0cm", "29.333%", "100%"] + ] + } + }, + "fontsAlias" : { + "stdMono": "CPMono_v07 Plain" + }, + "styles" : [ + [ "base", { + "fontSize": 10 + } + ], + ["code" , { + "parent": "literal", + "leftIndent": 0, + "spaceBefore": 0, + "spaceAfter": 4, + "backColor": null, + "borderColor": null, + "borderWidth": 0, + "leading":7, + "borderPadding": [1,1,5,1], + "fontSize": 8 + }], + ["bodytext" , { + "spaceBefore":0 + }], + ["small" , { + "parent": "base", + "fontSize": 6 + }], + ["heading1", { + "backColor": "#00599dff", + "borderColor": "#00599dff", + "borderWidth": 0.2, + "textColor": "#FFFFFF", + "leading": 10, + "alignment": "TA_CENTER", + "spaceBefore": 4, + "borderPadding": [3,0,5,0], + "leftIndent": 0, + "fontSize": 12, + "fontName": "stdSansBold" + }], + ["faketitle" , { + "borderPadding": [3,0,1,0], + "fontSize": 8, + "spaceBefore": 4, + "spaceAfter": 4, + "fontName": "stdSansBold" + }], + ["nota", { "parent": "heading", + "fontSize": 6, + "fontName": "stdSansBold", + "textColor": "#FFFFFF", + "alignment": "TA_RIGHT" + }], + ["table" , { + "spaceBefore":0, + "spaceAfter":3, + "colWidths": ["50%","50%"], + "commands": [ + [ "VALIGN", 
[0, 0], [-1, -1], "TOP" ], + [ "BOTTOMPADDING", [0, 0], [-1, -1], 0 ], + [ "TOPPADDING", [0, 0], [-1, -1], 1 ], + [ "LINEBELOW", [0, 0], [-1, -2], 0.2, "#E1E6EA" ] + ] + }], + ["exampletable1" , { + "spaceBefore":0, + "spaceAfter":3, + "colWidths": ["33.3%","33.3%","33.3%"], + "commands": [ + [ "VALIGN", [0, 0], [-1, -1], "TOP" ], + [ "BOTTOMPADDING", [0, 0], [-1, -1], -1 ], + [ "GRID", [0, 0], [-1, -1], 0.2, "#446885" ], + [ "BOX", [0, 0], [-1, -1], 0.2, "#446885" ] + ] + }], + ["faketrans" , { + "spaceBefore":3, + "spaceAfter":3, + "colWidths": ["100%"], + "commands": [ + [ "LINEABOVE", [0, 0], [-1, -1], 0.8, "#446885" ] + ] + }], + ["tablapie" , { + "spaceBefore":0, + "spaceAfter":0, + "colWidths": ["52%","19%","12%","17%"], + "commands": [ + [ "VALIGN", [0, 0], [-1, -1], "TOP" ], + [ "BOTTOMPADDING", [0, 0], [-1, -1], -3 ], + [ "LINEABOVE", [0, 0], [-1, -1], 0.4, "#446885" ] + ] + }], + ["izqfina" , { + "spaceBefore":0, + "spaceAfter":6, + "colWidths": ["10%",null], + "commands": [ + [ "VALIGN", [0, 0], [-1, -1], "MIDDLE" ], + [ "BOTTOMPADDING", [0, 0], [-1, -1], -3 ], + [ "LINEBELOW", [0, 0], [-1, -2], 0.2, "#E1E6EA" ] + ] + }], + ["tablacreditos", { + "parent": "bodytext", + "spaceBefore":-1, + "spaceAfter":0, + "colWidths": ["50%","50%"], + "commands": [ + [ "VALIGN", [0, 0], [-1, -1], "MIDDLE" ], + [ "BOTTOMPADDING", [0, 0], [-1, -1], -1 ], + [ "TOPPADDING", [0, 0], [1, 0], 3 ] + ] + }], + [ "endnote", { + "parent": "bodytext", + "colWidths": [52,null], + "spaceAfter": 4, + "commands": [ + [ "VALIGN", [ 0, 0 ], [ -1, -1 ], "TOP" ], + [ "BOTTOMPADDING", [0, 0], [-1, -1], 0 ], + [ "TOPPADDING", [0, 0], [-1, -1], 1 ], + [ "LINEBEFORE", [0, 0], [0,-1], 1, "#E1E6EA" ] + ] + }], + ["extranote" , { + "spaceBefore":0, + "spaceAfter":0, + "colWidths": [27,null], + "commands": [ + [ "VALIGN", [0, 0], [-1, -1], "MIDDLE" ], + [ "BOTTOMPADDING", [0, 0], [-1, -1], -3 ], + [ "BOX", [0, 0], [-1, -1], 0.2, "#446885" ], + [ "COLBACKGROUNDS", [0,0], [-1,-1], ["#446885", 
"#FFFFFF"]] + ] + }] + ] +} diff --git a/thirdParty/cupla/alpaka/docs/logo/alpaka.pdf b/thirdParty/cupla/alpaka/docs/logo/alpaka.pdf new file mode 100644 index 0000000000..5cc2153bf4 Binary files /dev/null and b/thirdParty/cupla/alpaka/docs/logo/alpaka.pdf differ diff --git a/thirdParty/cupla/alpaka/doc/images/alpaka.svg b/thirdParty/cupla/alpaka/docs/logo/alpaka.svg similarity index 100% rename from thirdParty/cupla/alpaka/doc/images/alpaka.svg rename to thirdParty/cupla/alpaka/docs/logo/alpaka.svg diff --git a/thirdParty/cupla/alpaka/doc/images/alpaka_401x135.png b/thirdParty/cupla/alpaka/docs/logo/alpaka_401x135.png similarity index 100% rename from thirdParty/cupla/alpaka/doc/images/alpaka_401x135.png rename to thirdParty/cupla/alpaka/docs/logo/alpaka_401x135.png diff --git a/thirdParty/cupla/alpaka/doc/doxygen/alpaka_doxygen.png b/thirdParty/cupla/alpaka/docs/logo/alpaka_doxygen.png similarity index 100% rename from thirdParty/cupla/alpaka/doc/doxygen/alpaka_doxygen.png rename to thirdParty/cupla/alpaka/docs/logo/alpaka_doxygen.png diff --git a/thirdParty/cupla/alpaka/doc/images/alpaka_inkscape.svg b/thirdParty/cupla/alpaka/docs/logo/alpaka_inkscape.svg similarity index 100% rename from thirdParty/cupla/alpaka/doc/images/alpaka_inkscape.svg rename to thirdParty/cupla/alpaka/docs/logo/alpaka_inkscape.svg diff --git a/thirdParty/cupla/alpaka/docs/requirements.txt b/thirdParty/cupla/alpaka/docs/requirements.txt new file mode 100644 index 0000000000..431a616fe0 --- /dev/null +++ b/thirdParty/cupla/alpaka/docs/requirements.txt @@ -0,0 +1,12 @@ +sphinx_rtd_theme>=0.3.1 +#recommonmark +sphinx==3.0.3 +breathe==4.16.0 +sphinxcontrib.programoutput +#sphinxcontrib-napoleon>=0.7 +pygments +# generate plots +#matplotlib +#scipy +#numpy +rst2pdf diff --git a/thirdParty/cupla/alpaka/docs/source/_static/custom.css b/thirdParty/cupla/alpaka/docs/source/_static/custom.css new file mode 100644 index 0000000000..db502b1bf4 --- /dev/null +++ 
b/thirdParty/cupla/alpaka/docs/source/_static/custom.css @@ -0,0 +1,3 @@ +.section { + text-align:justify; +} diff --git a/thirdParty/cupla/alpaka/docs/source/_static/general.css b/thirdParty/cupla/alpaka/docs/source/_static/general.css new file mode 100644 index 0000000000..f0c574cb54 --- /dev/null +++ b/thirdParty/cupla/alpaka/docs/source/_static/general.css @@ -0,0 +1,4 @@ +/* justify the normal text blocks */ +.section { + text-align:justify; +} diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/x86.md b/thirdParty/cupla/alpaka/docs/source/advanced/mapping.rst similarity index 74% rename from thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/x86.md rename to thirdParty/cupla/alpaka/docs/source/advanced/mapping.rst index 0fca42c3f6..55ecc86dd7 100644 --- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/x86.md +++ b/thirdParty/cupla/alpaka/docs/source/advanced/mapping.rst @@ -1,12 +1,30 @@ -[:arrow_up: Up](../Mapping.md) +Mapping onto Specific Hardware Architectures +============================================ + +By providing an accelerator independent interface for kernels, their execution and memory accesses at different hierarchy levels, *alpaka* allows the user to write accelerator independent code that does not neglect performance. + +The mapping of the decomposition to the execution environment is handled by the back-ends provided by the *alpaka* library as well as user defined back-ends. +A computation that is described with a maximum of the parallelism available in the *redundant hierarchical parallelism* abstraction can not be mapped one to one to any existing hardware. +GPUs do not have vector registers for ``float`` or ``double`` types. +Therefore, the element level is often omitted on *CUDA* accelerators. +CPUs in turn are not (currently) capable of running thousands of threads concurrently and do not have equivalently fast inter-thread synchronization and shared memory access as GPUs do. 
+ +A major point of the *redundant hierarchical parallelism* abstraction is to ignore specific unsupported levels and utilize only the ones supported on a specific accelerator. +This allows a mapping to various current and future accelerators in a variety of ways enabling optimal usage of the underlying compute and memory capabilities. + +The grid level is always mapped to the whole device being in consideration. +The scheduler can always execute multiple kernel grids from multiple queues in parallel by statically or dynamically subdividing the available resources. +However, this will only ever simplify the mapping due to less available processing units. +Furthermore, being restricted to less resources automatically improves the locality of data due to spatial and temporal locality properties of the caching hierarchy. x86 CPUs -======== +```````` There are multiple possible ways to map the *alpaka* abstraction to x86 CPUs. The following figure shows the compute and memory hierarchy of a dual-socket (package) node with dual-core CPUs and symmetric multithreading (Hyper-Threading). Through symmetric multithreading (Hyper-Threading) each core represents two processing units. -![x86_cpu](x86/x86_cpu.png) + +.. image:: /images/x86_cpu.png Thread ------ @@ -18,21 +36,23 @@ Warp ---- Even though a warp seems to be identical to a vector register, because both execute a single uniform instruction on multiple data elements, they are not the same. -[Warps](../../Abstraction.md) can handle branches with divergent control flows of multiple threads. +:doc:`Warps ` can handle branches with divergent control flows of multiple threads. There is no equivalent hardware unit in a CPU supporting this. Therefore, the warp level can not be utilized on CPUs leading to a one-to-one mapping of threads to warps which does not violate the rules of the abstraction. 
Block ----- -### One Block Per Node +One Block Per Node +++++++++++++++++++ By combining all processing units (possibly Hyper-Threads) of all processors on a node into one block, the number of synchronizing and communicating threads can be enlarged. This high possible thread count would simplify the implementation of some types of algorithms but introduces performance issues on multi-core nodes. The shared memory between all cores on a node is the RAM. However, the RAM and the communication between the sockets is far too slow for fine-grained communication in the style of *CUDA* threads. -### One Block Per Socket +One Block Per Socket +++++++++++++++++++++ If each processor on each socket would concurrently execute one block, the L3 cache would be used as the fast shared memory. Although this is much better then to use the RAM, there is still a problem. @@ -50,14 +70,16 @@ This property is exploited on *CUDA* GPUs, where memory accesses within a warp a However, when multiple threads from multiple CPU cores write to different elements within a cache line, this advantage is reversed into its opposite. This pattern non-intuitively leads to heavy performance degradation and is called false-sharing. -### One Block Per Core +One Block Per Core +++++++++++++++++++ The best compromise between a high number of threads per block and a fast communication between the threads is to map a block directly to a CPU core. Each processing unit (possibly a Hyper-Thread) executes one or more threads of our hierarchical abstraction while executing multiple elements locally either by processing them sequentially or in a vectorized fashion. This possible mapping of blocks, threads and elements to the compute and memory hierarchy of a dual-socket node with dual-core CPUs and symmetric multithreading is illustrated in the following figure. 
![x86_cpu](x86/x86_cpu_mapping.png) -### One Block Per Thread +One Block Per Thread +++++++++++++++++++++ If there is no symmetric multithreading or if it is desired, it is also possible to implement a mapping of one block with exactly one thread for each processing unit. This allows to completely remove the synchronization overhead for tasks where this is not required at all. @@ -79,7 +101,8 @@ This is called oversubscription. Those threads can be bound to the correct cores and by relying on the operating system thread scheduler, they are preemptively multitasked while sharing a single cache and thereby avoiding false-sharing. However, this is not always beneficial because the cost of thread changes by the kernel-mode scheduler should not be underestimated. -### Fibers +Fibers +++++++ To remove the overhead of the kernel mode scheduler as well as to enable the usage of deterministic thread context-switches, fibers can be used. A fiber is a user-space thread with cooperative context-switches and extends the concept of coroutines. @@ -95,3 +118,12 @@ An advantage of a user level scheduler over the operating system thread schedule Furthermore, fibers reduce the number of locks and busy waits within a block because only one fiber is active per operating system thread at a time. There are multiple C++ Standards Committee Papers (N3858, N3985, N4134) discussing the inclusion of fibers, awaitable functions and similar concepts into C++. + +GPUs (CUDA/HIP) +``````````````` + +Mapping the abstraction to GPUs supporting *CUDA* and *HIP* is straightforward because the hierarchy levels are identical up to the element level. +So blocks of warps of threads will be mapped directly to their *CUDA*/*HIP* equivalent. + +The element level is supported through an additional run-time variable containing the extent of elements per thread. +This variable can be accessed by all threads and should optimally be placed in constant device memory for fast access. 
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Rationale.md b/thirdParty/cupla/alpaka/docs/source/advanced/rationale.rst similarity index 69% rename from thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Rationale.md rename to thirdParty/cupla/alpaka/docs/source/advanced/rationale.rst index 4aa65958c9..ed43518418 100644 --- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Rationale.md +++ b/thirdParty/cupla/alpaka/docs/source/advanced/rationale.rst @@ -1,36 +1,41 @@ -[:arrow_up: Up](../Library.md) +.. highlight:: cpp + :linenothreshold: 5 Rationale ========= Interface Distinction --------------------- +--------------------- The *alpaka* library is different from other similar libraries (especially *CUDA*) in that it refrains from using implicit or hidden state. This and other interface design decisions will be explained int the following paragraphs. -### No Current Device: +No Current Device: +++++++++++++++++++ + The *CUDA* runtime API for example supplies a current device for each user code kernel-thread. -Working with multiple devices requires to call `cudaSetDevice` to change the current device whenever an operation should be executed on a non-current device. -Even the functions for creating a queue (`cudaStreamCreate`) or an event (`cudaEventCreate`) use the current device without any way to create them on a non current device. +Working with multiple devices requires to call ``cudaSetDevice`` to change the current device whenever an operation should be executed on a non-current device. +Even the functions for creating a queue (``cudaStreamCreate``) or an event (``cudaEventCreate``) use the current device without any way to create them on a non current device. In the case of an event this dependency is not obvious, since at the same time queues can wait for events from multiple devices allowing cross-device synchronization without any additional work. 
So conceptually an event could also have been implemented device independently. This can lead to hard to track down bugs due to the non-explicit dependencies, especially in multi-threaded code using multiple devices. -### No Default Device: +No Default Device: +++++++++++++++++++ + In contrast to the *CUDA* runtime API *alpaka* does not provide a device by default per kernel-thread. Especially in combination with *OpenMP* parallelized host code this keeps users from surprises. The following code snippet shows that it does not necessarily do what one would expect. -```C++ -cudaSetDevice(1); +.. code-block:: + + cudaSetDevice(1); -#pragma omp parallel for -for(int i = 0; i<10; ++i) -{ - kernel<<>>(i); -} -``` + #pragma omp parallel for + for(int i = 0; i<10; ++i) + { + kernel<<>>(i); + } Depending on what the *CUDA* runtime API selects as default device for each of the *OpenMP* threads (due to each of them having its own current device), not all of the kernels will necessarily run on device one. @@ -40,7 +45,9 @@ The *alpaka* *CUDA* back-end checks before forwarding the calls to the *CUDA* ru The *alpaka* *CUDA* back-end does not reset the current device to the one prior to the method invocation out of performance considerations. This has to be considered when native *CUDA* code is combined with *alpaka* code. -### No Default Queue: +No Default Queue: ++++++++++++++++++ + *CUDA* allows to execute commands without specifying a queue. The default queue that is used synchronizes implicitly with all other queues on the device. If a command queue is issued to the default, all other asynchronous queues have to wait before executing any new commands, even when they have been enqueued much earlier. @@ -49,7 +56,9 @@ As of *CUDA* 7.0 the default queue can be converted to a non synchronizing queue Because concurrency is crucial for performance and users should think about the dependencies between their commands from begin on, *alpaka* does not provide such a default queue. 
All asynchronous operations (kernel launches, memory copies and memory sets) require a queue to be executed in. -### No Implicit Built-in Variables and Functions: +No Implicit Built-in Variables and Functions: +--------------------------------------------- + Within *CUDA* device functions (functions annotated with `__global__` or `__device__`) built-in functions (`__syncthreads`, `__threadfence`, `atomicAdd`, ... ) and variables (`gridDim`, `blockIdx`, `blockDim`, `threadIdx`, `warpSize`, ...) are provided. It would have been possible to emulate those implicit definitions by forcing the kernel function object to inherit from a class providing these functions and members. @@ -63,11 +72,15 @@ The built-in variables can be accessed by the user via query functions on this a * Abandoning all the implicit and default state makes it much easier for users of the library to reason about their code. * -### No Language Extensions: +No Language Extensions: +----------------------- + Unlike *CUDA*, the *alpaka* library does not extend the C++ language with any additional variable qualifiers (`__shared__`, `__constant__`, `__device__`) defining the memory space. Instead of those qualifiers *alpaka* provides accelerator functions to allocate memory in different the different memory spaces. -### No Dimensionality Restriction: +No Dimensionality Restriction: +------------------------------ + *CUDA* always uses three-dimensional indices and extents, even though the task may only be one or two dimensional. *OpenCL* on the other hand allows grid and block dimensions in the range [1,3] but does not provide corresponding n-dimensional indices, but rather provides functions like `get_global_id` or `get_local_id`, which require the dimension in which the one-dimensional ID is to be queried as a parameter. By itself this is no problem, but how can be assured that a two-dimensional kernel is called with grid and block extents of the correct dimensionality at compile time? 
@@ -81,7 +94,9 @@ Furthermore with the dimension being a template parameter, the CPU back-end impl *By hiding all the accelerator functionality inside of the accelerator object that is passed to the user kernel, the user of the *alpaka* library is not faced with any non-standard C++ extensions. Nevertheless the *CUDA* back-end internally uses those language extensions.* -### Integral Sizes of Arbitrary Type: +Integral Sizes of Arbitrary Type: +--------------------------------- + The type of sizes such as extents, indices and related variables are depending on a template parameter of the accelerator and connected classes. This allows the kernel to be executed with sizes of arbitrary ranges. Thereby it is possible to force the accelerator back-ends to perform all internal index, extent and other integral size depending computations with a given precision. @@ -89,7 +104,9 @@ This is especially useful on current *NVIDIA* GPUs. Even though they support 64-bit integral operations, they are emulated with multiple 32-bit operations. This can be a huge performance penalty when the sizes of buffers, offsets, indices and other integral variables holding sizes are known to be limited. -### No synchronous (blocking) and asynchronous (non-blocking) function versions: +No Synchronous (Blocking) and Asynchronous (Non-Blocking) Function Versions: +---------------------------------------------------------------------------- + *CUDA* provides two versions of many of the runtime functions, for example, `cudaMemcpyAsync` and `cudaMemcpy`. The asynchronous version requires a queue while the synchronous version does not need a queue parameter. The asynchronous version immediately returns control back to the caller while the task is enqueued into the given queue and executed later in parallel to the host code. @@ -102,7 +119,9 @@ Non-blocking queues as well as blocking queues are provided for all devices. 
Changes to the synchronicity of multiple tasks can be made on a per queue basis by changing the queue type at the place of creation. There is no need to change any line of calling code. -### Memory Management +Memory Management +----------------- + Memory buffers can not only be identified by the pointer to their first byte. The C++ `new` and `malloc`, the *CUDA* `cudaMalloc` as well as the *OpenCL* `clCreateBuffer` functions all return a plain pointer. This is not enough when working with multiple accelerators and multiple devices. @@ -118,23 +137,23 @@ Many parallelization libraries / frameworks do not fully support the separation *OpenMP*, for example, fully mixes the per thread algorithm and the parallelization strategy. This can be seen in the source listing showing a simple AXPY computation with OpenMP. -```C++ -template< - typename TIdx, - typename TElem> -void axpyOpenMP( - TIdx const n, - TElem const alpha, - TElem const * const X, - TElem * const Y) -{ - #pragma omp parallel for - for (i=0; i + void axpyOpenMP( + TIdx const n, + TElem const alpha, + TElem const * const X, + TElem * const Y) + { + #pragma omp parallel for + for (i=0; i -__global__ void axpyCUDA( - TIdx const n, - TElem const alpha, - TElem const * const X, - TElem * const Y) -{ - TIdx const i(blockIdx.x*blockDim.x + threadIdx.x) - if(i < n) - { - Y[i] = alpha * X[i] + Y[i]; - } -} -``` +.. code-block:: + + template< + typename TIdx, + typename TElem> + __global__ void axpyCUDA( + TIdx const n, + TElem const alpha, + TElem const * const X, + TElem * const Y) + { + TIdx const i(blockIdx.x*blockDim.x + threadIdx.x) + if(i < n) + { + Y[i] = alpha * X[i] + Y[i]; + } + } On the other hand the *CUDA* implementation is bloated with code handling the inherent blocking scheme. Even if the algorithm does not utilize blocking, as it is the case here, the algorithm writer has to calculate the global index of the current thread by hand (line 10). 
@@ -175,78 +194,94 @@ Recombining multiple kernel iterations to loop over lines, columns, blocks or an In contrast, by using *OpenMP* this would not be possible. Therefore the *alpaka* interface builds on the kernel concept, being the body of the corresponding standard for loop executed in each thread. -### Execution Domain Specifications +Execution Domain Specifications +------------------------------- *CUDA* requires the user to annotate its functions with execution domain specifications. -Functions that can only be executed on the GPU have to be annotated with `__device__`, functions that can be executed on the host and on the GPU have to be annotated with `__host__ __device__` and host only functions can optionally be annotated with `__host__`. +Functions that can only be executed on the GPU have to be annotated with ``__device__``, functions that can be executed on the host and on the GPU have to be annotated with ``__host__ __device__`` and host only functions can optionally be annotated with ``__host__``. The nvcc *CUDA* compiler uses these annotations to decide with which back-ends a function has to be compiled. -Depending on the compiler in use, *alpaka* defines the macros `ALPAKA_FN_HOST`, `ALPAKA_FN_ACC` and `ALPAKA_FN_HOST_ACC` with the identical meaning which can be used in the same positions. +Depending on the compiler in use, *alpaka* defines the macros ``ALPAKA_FN_HOST``, ``ALPAKA_FN_ACC`` and ``ALPAKA_FN_HOST_ACC`` with the identical meaning which can be used in the same positions. When the *CUDA* compiler is used, they are defined to their *CUDA* equivalents, else they are empty. -### Kernel Function +Kernel Function +--------------- -#### Requirements +Requirements +++++++++++++ - User kernels should be implemented independent of the accelerator. - A user kernel has to have access to accelerator methods (synchronization within blocks, index retrieval, ...). 
-- For usage with CUDA, the kernel methods have to be attributed with \__device\__ \__host\__. +- For usage with CUDA, the kernel methods have to be attributed with ``__device__ __host__``. - The user kernel has to fulfill std::is_trivially_copyable because only such objects can be copied into CUDA device memory. A trivially copyable class is a class that - 1. Has no non-trivial copy constructors(this also requires no virtual functions or virtual bases) - 2. Has no non-trivial move constructors - 3. Has no non-trivial copy assignment operators - 4. Has no non-trivial move assignment operators - 5. Has a trivial destructor + #. Has no non-trivial copy constructors(this also requires no virtual functions or virtual bases) + #. Has no non-trivial move constructors + #. Has no non-trivial copy assignment operators + #. Has no non-trivial move assignment operators + #. Has a trivial destructor -#### Implementation Variants +Implementation Variants ++++++++++++++++++++++++ There are two possible ways to tell the kernel about the accelerator type: - 1. The kernel is templated on the accelerator type ... - * + This allows users to specialize them for different accelerators. (Is this is really necessary or desired?) - * - The kernel has to be a class template. This does not allow C++ lambdas to be used as kernels because they are no templates themselves (but only their `operator()` can be templated). - * - This prevents the user from instantiating an accelerator independent kernel before executing it. - Because the memory layout in inheritance hierarchies is undefined a simple copy of the user kernel or its members to its specialized type is not possible platform independently. - This would require a copy from UserKernel to UserKernel to be possible. - The only way to allow this would be to require the user to implement a templated copy constructor for every kernel. 
- This is not allowed for kernels that should be copyable to a CUDA device because std::is_trivially_copyable requires the kernel to have no non-trivial copy constructors. - * a) ... and inherits from the accelerator. - * - The kernel itself has to inherit at least protected from the accelerator to allow the KernelExecutor to access the Accelerator. - * - How do accelerator functions called from the kernel (and not within the kernel class itself) access the accelerator methods? - Casting this to the accelerator type and giving it as parameter is too much to require from the user. - * b) ... and the `operator()` has a reference to the accelerator as parameter. - * + This allows to use the accelerator in functions called from the kernel (and not within the kernel class itself) to access the accelerator methods in the same way the kernel entry point function can. - * - This would require an additional object (the accelerator) in device memory taking up valuable CUDA registers (opposed to the inheritance solution). At least on CUDA all the accelerator functions could be inlined nevertheless. - 2. The `operator()` is templated on the accelerator type and has a reference to the accelerator as parameter. - * + The kernel can be an arbitrary function object with ALPAKA_FN_HOST_ACC attributes. - * + This would allow to instantiate the accelerator independent kernel and set its members before execution. - * +/- usable with polymorphic lambdas. - * - The `operator()` could be overloaded on the accelerator type but there is no way to specialize the whole kernel class itself, so it always has the same members. - * - This would require an additional object (the accelerator) in device memory taking up valuable CUDA registers (opposed to the inheritance solution). At least on CUDA all the accelerator functions could be inlined nevertheless. + +#. The kernel is templated on the accelerator type ... + + * (+) This allows users to specialize them for different accelerators. 
(Is this really necessary or desired?) + * (-) The kernel has to be a class template. This does not allow C++ lambdas to be used as kernels because they are no templates themselves (but only their ``operator()`` can be templated). + * (-) This prevents the user from instantiating an accelerator independent kernel before executing it. + Because the memory layout in inheritance hierarchies is undefined a simple copy of the user kernel or its members to its specialized type is not possible platform independently. + This would require a copy from UserKernel to UserKernel to be possible. + The only way to allow this would be to require the user to implement a templated copy constructor for every kernel. + This is not allowed for kernels that should be copyable to a CUDA device because std::is_trivially_copyable requires the kernel to have no non-trivial copy constructors. + + a) ... and inherits from the accelerator. + + * (-) The kernel itself has to inherit at least protected from the accelerator to allow the KernelExecutor to access the Accelerator. + + * (-) How do accelerator functions called from the kernel (and not within the kernel class itself) access the accelerator methods? + + Casting this to the accelerator type and giving it as parameter is too much to require from the user. + b) ... and the ``operator()`` has a reference to the accelerator as parameter. + + * (+) This allows to use the accelerator in functions called from the kernel (and not within the kernel class itself) to access the accelerator methods in the same way the kernel entry point function can. + * (-) This would require an additional object (the accelerator) in device memory taking up valuable CUDA registers (opposed to the inheritance solution). At least on CUDA all the accelerator functions could be inlined nevertheless. + +#. The ``operator()`` is templated on the accelerator type and has a reference to the accelerator as parameter. 
+ + * (+) The kernel can be an arbitrary function object with ``ALPAKA_FN_HOST_ACC`` attributes. + * (+) This would allow to instantiate the accelerator independent kernel and set its members before execution. + * (+/-) usable with polymorphic lambdas. + * (-) The ``operator()`` could be overloaded on the accelerator type but there is no way to specialize the whole kernel class itself, so it always has the same members. + * (-) This would require an additional object (the accelerator) in device memory taking up valuable CUDA registers (opposed to the inheritance solution). At least on CUDA all the accelerator functions could be inlined nevertheless. Currently we implement version 2. -#### Implementation Notes +Implementation Notes +++++++++++++++++++++ Unlike *CUDA*, the *alpaka* library does not differentiate between the kernel function that represents the entry point and other functions that can be executed on the accelerator. -The entry point function that has to be annotated with `__global__` in *CUDA* is internal to the *alpaka* *CUDA* back-end and is not exposed to the user. -It directly calls into the user supplied kernel function object whose invocation operator is declared with `ALPAKA_FN_ACC`, which equals `__device__` in *CUDA*. +The entry point function that has to be annotated with ``__global__`` in *CUDA* is internal to the *alpaka* *CUDA* back-end and is not exposed to the user. +It directly calls into the user supplied kernel function object whose invocation operator is declared with ``ALPAKA_FN_ACC``, which equals ``__device__`` in *CUDA*. In this respect there is no difference between the kernel entry point function and any other accelerator function in *alpaka*. -The `operator()` of the kernel function object has to be `const`. +The ``operator()`` of the kernel function object has to be ``const``. This is especially important for the *CUDA* back-end, as it could possibly use the constant memory of the GPU to store the function object. 
The constant memory is a fast, cached, read-only memory that is beneficial when all threads uniformly read from the same address at the same time. In this case it is as fast as a read from a register. -### Access to accelerator dependent functionality +Access to Accelerator-Dependent Functionality ++++++++++++++++++++++++++++++++++++++++++++++ There are two possible ways to implement access to accelerator dependent functionality inside a kernel: -* Making the functions/templates members of the accelerator (maybe by inheritance) and calling them like `acc.syncThreads()` or `acc.template getIdx()`. -This would require the user to know and understand when to use the template keyword inside dependent type object function calls. -* The functions are only light wrappers around traits that can be specialized taking the accelerator as first value (it can not be the last value because of the potential use of variadic arguments). -The resulting code would look like `sync(acc)` or `getIdx(acc)`. -Internally these wrappers would call trait templates that are specialized for the specific accelerator e.g. `template Sync{...};` + +* Making the functions/templates members of the accelerator (maybe by inheritance) and calling them like ``acc.syncThreads()`` or ``acc.template getIdx()``. + This would require the user to know and understand when to use the template keyword inside dependent type object function calls. +* The functions are only light wrappers around traits that can be specialized taking the accelerator as first value (it can not be the last value because of the potential use of variadic arguments). + The resulting code would look like ``sync(acc)`` or ``getIdx(acc)``. + Internally these wrappers would call trait templates that are specialized for the specific accelerator e.g. ``template Sync{...};`` The second version is easier to understand and usually shorter to use in user code. 
@@ -254,8 +289,8 @@ The second version is easier to understand and usually shorter to use in user co Index and Work Division ----------------------- -*CUDA* requires the user to calculate the global index of the current thread within the grid by hand (already shown as `axpyCUDA`). -On the contrary, *OpenCL* provides the methods `get_global_size`, `get_global_id`, `get_local_size` and `get_local_id`. +*CUDA* requires the user to calculate the global index of the current thread within the grid by hand (already shown as ``axpyCUDA``). +On the contrary, *OpenCL* provides the methods ``get_global_size``, ``get_global_id``, ``get_local_size`` and ``get_local_id``. Called with the required dimension, they return the corresponding local or global index or extent (size). In *alpaka* this idea is extended to all dimensions. To unify the method interface and to avoid confusion between the differing terms and meanings of the functions in *OpenCL* and *CUDA*, in *alpaka* these methods are template functions. @@ -263,16 +298,19 @@ To unify the method interface and to avoid confusion between the differing terms Block Shared Memory ------------------- - -### Static Block Shared Memory + +Static Block Shared Memory +++++++++++++++++++++++++++ The size of block shared memory that is allocated inside the kernel is required to be given as compile time constant. This is due to CUDA not allowing to allocate block shared memory inside a kernel at runtime. - -### Dynamic Block Shared Memory + +Dynamic Block Shared Memory ++++++++++++++++++++++++++++ The size of the external block shared memory is obtained from a trait that can be specialized for each kernel. The trait is called with the current kernel invocation parameters and the block-element extent prior to each kernel execution. 
Because the block shared memory size is only ever constant or dependent on the block-element extent or the parameters of the invocation this has multiple advantages:
+
* It forces the separation of the kernel invocation from the calculation of the required block shared memory size.
* It lets the user write this calculation once instead of multiple times spread across the code.
diff --git a/thirdParty/cupla/alpaka/docs/source/basic/abstraction.rst b/thirdParty/cupla/alpaka/docs/source/basic/abstraction.rst
new file mode 100644
index 0000000000..6ecd599a81
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/source/basic/abstraction.rst
@@ -0,0 +1,275 @@
+Abstraction
+===========
+
+.. note::
+
+   Objective of the abstraction is to separate the parallelization strategy from the algorithm itself.
+   Algorithm code written by users should not depend on any parallelization library or specific strategy.
+   This would enable exchanging the parallelization back-end without any changes to the algorithm itself.
+   Besides allowing to test different parallelization strategies this also makes it possible to port algorithms to new, yet unsupported, platforms.
+
+Parallelism and memory hierarchies at all levels need to be exploited in order to achieve performance portability across various types of accelerators.
+Within this chapter an abstraction will be derived that tries to provide a maximum of parallelism while simultaneously considering implementability and applicability in hardware.
+
+Looking at the current HPC hardware landscape, we often see nodes with multiple sockets/processors extended by accelerators like GPUs or Intel Xeon Phi, each with their own processing units.
+Within a CPU or an Intel Xeon Phi there are cores with hyper-threads, vector units and a large caching infrastructure.
+Within a GPU there are many small cores and only few caches.
+Each entity in the hierarchy has access to different memories.
+For example, each socket / processor manages its RAM, while the cores additionally have non-explicit access to L3, L2 and L1 caches.
+On a GPU there are global, constant, shared and other memory types which all can be accessed explicitly.
+The interface has to abstract from these differences without sacrificing speed on any platform.
+
+A process running on a multi-socket node is the largest entity within *alpaka*.
+The abstraction is only about the task and data parallel execution on the process/node level and down.
+It does not provide any primitives for inter-node communication.
+However, such libraries can be combined with *alpaka*.
+
+An application process always has a main thread and is by definition running on the host.
+It can access the host memory and various accelerator devices.
+Such accelerators can be GPUs, Intel Xeon Phis, the host itself or other devices.
+Thus, the host does not necessarily have to be different from the accelerator device used for the computations.
+For instance, an Intel Xeon Phi simultaneously can be the host and the accelerator device.
+
+The *alpaka* library can be used to offload the parallel execution of task and data parallel work simultaneously onto different accelerator devices.
+
+Task Parallelism
+----------------
+
+One of the basic building blocks of modern applications is task parallelism.
+For example, the operating system scheduler, deciding which thread of which process gets how much processing time on which CPU core, enables task parallelism of applications.
+It controls the execution of different tasks on different processing units.
+Such task parallelism can be, for instance, the output of the progress in parallel to a download.
+This can be implemented via two threads executing two different tasks.
+
+The valid dependencies between tasks within an application can be defined as a DAG (directed acyclic graph) in all cases.
+The tasks are represented by nodes and the dependencies by edges.
+In this model, a task is ready to be executed if the number of incoming edges is zero.
+After a task has finished its work, it is removed from the graph as well as all of its outgoing edges.
+This reduces the number of incoming edges of subsequent tasks.
+
+The problem with this model is the inherent overhead and the missing hardware and API support.
+When it is directly implemented as a graph, at least all depending tasks have to be updated and checked if they are ready to be executed after a task finished.
+Depending on the size of the graph and the number of edges this can be a huge overhead.
+
+*OpenCL* allows to define a task graph in a somewhat different way.
+Tasks can be enqueued into an out-of-order command queue combined with events that have to be finished before the newly enqueued task can be started.
+Tasks in the command queue with unmet dependencies are skipped and subsequent ones are executed.
+The ``CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE`` property of a command queue is an optional feature only supported by few vendors.
+Therefore, it can not be assumed to be available on all systems.
+
+*CUDA* on the other hand currently (version 7.5) does not support such out-of-order queues in any way.
+The user has to define dependencies explicitly through the order the tasks are enqueued into the queues (called *streams* in *CUDA*).
+Within a queue, tasks are always executed in sequential order, while multiple queues are executed in parallel.
+Queues can wait for events enqueued into other queues.
+
+In both APIs, *OpenCL* and *CUDA*, a task graph can be emulated by creating one queue per task and enqueuing a unique event after each task, which can be used to wait for the preceding task.
+However, this is not feasible due to the large queue and event creation costs as well as other overheads within this process.
+
+Therefore, to be compatible with a wide range of APIs, the interface for task parallelism has to be constrained.
+Instead of a general DAG, multiple queues of sequentially executed tasks will be used to describe task parallelism. +Events that can be enqueued into the queues enhance the basic task parallelism by enabling synchronization between different queues, devices or the host threads. + +Data Parallelism +---------------- + +In contrast to task parallelism, data parallelism describes the execution of one and the same task on multiple, often related data elements. +For example, an image color space conversion is a textbook example of a data parallel task. +The same operation is executed independently on each pixel. +Other data parallel algorithms additionally introduce dependencies between threads in the input-, intermediate-, or output-data. +For example, the calculation of a brightness histogram has no input-data dependencies. +However, all pixel brightness values finally have to be merged into a single result. +Even these two simple examples show that it is necessary to think about the interaction of parallel entities to minimize the influence of data dependencies. + +Furthermore, it is necessary to respect the principles of spatial and temporal locality. +Current hardware is built around these locality principles to reduce latency by using hierarchical memory as a trade-off between speed and hardware size. +Multiple levels of caches, from small and very fast ones to very large and slower ones exploit temporal locality by keeping recently referenced data as close to the actual processing units as possible. +Spatial locality in the main memory is also important for caches because they are usually divided into multiple lines that can only be exchanged one cache line at a time. +If one data element is loaded and cached, it is highly likely that nearby elements are also cached. +If the pixels of an image are stored row wise but are read out column wise, the spatial locality assumption of many CPUs is violated and the performance suffers. 
+GPUs on the other hand do not have a large caching hierarchy but allow explicit access to a fast memory shared across multiple cores. +Therefore, the best way to process individual data elements of a data parallel task is dependent on the data structure as well as the underlying hardware. + +The main part of the *alpaka* abstraction is the way it abstracts data parallelism and allows the algorithm writer to take into account the hierarchy of processing units, their data parallel features and corresponding memory regions. +The abstraction developed is influenced and based on the groundbreaking *CUDA* and *OpenCL* abstractions of a multidimensional grid of threads with additional hierarchy levels in between. +Another level of parallelism is added to those abstractions to unify the data parallel capabilities of modern hardware architectures. +The explicit access to all hierarchy levels enables the user to write code that runs performant on all current platforms. +However, the abstraction does not try to automatically optimize memory accesses or data structures but gives the user full freedom to use data structures matching the underlying hardware preferences. + +Thread +`````` + +Theoretically, a basic data parallel task can be executed optimally by executing one thread per independent data element. +In this context, the term thread does not correspond to a native kernel-thread, an *OpenMP* thread, a *CUDA* thread, a user-level thread or any other such threading variant. +It only represents the execution of a sequence of commands forming the desired algorithm on a per data element level. +This ideal one-to-one mapping of data elements to threads leads to the execution of a multidimensional grid of threads corresponding to the data structure of the underlying problem. +The uniform function executed by each of the threads is called a kernel. 
+Some algorithms such as reductions require the possibility to synchronize or communicate between threads to calculate a correct result in a time optimal manner.
+Therefore our basic abstraction requires an n-dimensional grid of synchronizable threads each executing the same kernel.
+The following figure shows a hypothetical processing unit that could optimally execute this data parallel task.
+The threads are mapped one-to-one to the cores of the processor.
+For a time optimal execution, the cores have to have an all-to-all equal length connection for communication and synchronization.
+
+.. image:: /images/thread.png
+
+The only difference between the threads is their positional index into the grid which allows each thread to compute a different part of the solution.
+Threads can always access their private registers and the global memory.
+
+Registers
++++++++++
+
+All variables with default scope within a kernel are automatically saved in registers and are not shared automatically.
+This memory is local to each thread and can not be accessed by other threads.
+
+Global Memory
++++++++++++++
+
+The global memory can be accessed from every thread in the grid as well as from the host thread.
+This is typically the largest but also the slowest memory available.
+
+Individual threads within the grid are allowed to statically or dynamically allocate buffers in the global memory.
+
+Prior to the execution of a task, the host thread copies the input buffers and allocates the output buffers onto the accelerator device.
+Pointers to these buffers then can be given as arguments to the task invocation.
+By using the index of each thread within the grid, the offset into the global input and output buffers can be calculated.
+After the computation has finished, the output buffer can be used either as input to a subsequent task or can be copied back to the host.
+
+Block
+`````
+
+Building a processor with possibly thousands of cores where all cores have an equal length connection for fast communication and synchronization is not viable.
+Either the processor size would have to grow exponentially with the number of cores or the all-to-all communication speed would decrease so much that computations on the processor would be impractical.
+Therefore, the communication and synchronization of threads has to be limited to sizes manageable by real hardware.
+
+The following figure depicts the solution of introducing a new hierarchy level in the abstraction.
+A hypothetical processor is allowed to provide synchronization and fast communication within blocks of threads but is not required to provide synchronization across blocks.
+The whole grid is subdivided into equal sized blocks with a fast but small shared memory.
+Current accelerator abstractions (*CUDA* and *OpenCL*) only support equal sized blocks.
+This restriction could possibly be lifted to support future accelerators with heterogeneous block sizes.
+
+.. image:: /images/block.png
+
+There is another reason why independent blocks are necessary.
+Threads that can communicate and synchronize require either a one-to-one mapping of threads to cores, which is impossible because the number of data elements is theoretically unlimited, or at least a space to store the state of each thread.
+Even old single core CPUs were able to execute many communicating and synchronizing threads by using cooperative or preemptive multitasking.
+Therefore, one might think that a single core would be enough to execute all the data parallel threads.
+But the problem is that even storing the set of registers and local data of all the possible millions of threads of a task grid is not always viable.
+The blocking scheme solves this by enabling fast interaction of threads on a local scale but additionally removes the necessity to store the state of all threads in the grid at once because only threads within a block must be executed in parallel. +Within a block of cores there still has to be enough memory to store all registers of all contained threads. +The independence of blocks allows applications to scale well across diverse devices. +As can be seen in the following figure, the accelerator can assign blocks of the task grid to blocks of cores in arbitrary order depending on availability and workload. + +.. image:: /images/block_scale.png + +Shared Memory ++++++++++++++ + +Each block has its own shared memory. +This memory can only be accessed explicitly by threads within the same block and gets discarded after the complete block finished its calculation. +This memory is typically very fast but also very small. +No variables are shared between kernels by default. + +Warp +```` + +With the current abstraction only independent parallelism via blocks and synchronizable parallelism via threads can be expressed. +However, there are more variants of parallelism in real hardware. +Because all threads in the grid are executing the same kernel and even the same instruction at the same time when ignoring divergent control flows, a lot of chip space can be saved. +Multiple threads can be executed in perfect synchronicity, which is also called lock-step. +A group of such threads executing the same instruction at the same time is called a warp . +All threads within a warp share a single instruction pointer (IP), and all cores executing the threads share one instruction fetch (IF) and instruction decode (ID) unit. + +.. image:: /images/warp.png + +Even threads with divergent control flows can be executed within one warp. +*CUDA*, for example, solves this by supporting predicated execution and warp voting. 
+For long conditional branches the compiler inserts code which checks if all threads in the warp take the same branch.
+For small branches, where this is too expensive, all threads always execute all branches.
+Control flow statements result in a predicate and only in those threads where it is true, the predicated instructions will have an effect.
+
+Not only *CUDA* GPUs support the execution of multiple threads in a warp.
+Full blown vector processors with good compilers are capable of combining multiple loop iterations containing complex control flow statements in a similar manner as *CUDA*.
+
+Due to the synchronicity of threads within a warp, memory operations will always occur at the same time in all threads.
+This allows to coalesce memory accesses.
+Different *CUDA* devices support different levels of memory coalescing.
+Older ones only supported combining multiple memory accesses if they were aligned and sequential in the order of thread indices.
+Newer ones support unaligned scattered accesses as long as they target the same 128 byte segment.
+
+The ability of very fast context switches between warps and a queue of ready warps allows *CUDA* capable GPUs to hide the latency of global memory operations.
+
+Element
+```````
+
+To use the maximum available computing power of, for example, a modern x86 processor, the computation has to utilize the SIMD vector registers.
+Many current architectures support issuing a single instruction that can be applied to multiple data elements in parallel.
+
+The original x86 instruction set architecture did not support SIMD instructions but has been enhanced with MMX (64 bit width registers), SSE (128 bit width registers), AVX (256 bit width registers) and AVX-512 (512 bit width registers) extensions.
+In varying degree, they allow to process multiple 32 bit and 64 bit floating point numbers as well as 8, 16, 32 and 64 bit signed and unsigned integers.
+ +*CUDA* capable GPUs do not have vector registers where multiple values of type ``float`` or ``double`` can be manipulated by one instruction. +Nevertheless, newer *CUDA* capable devices implement basic SIMD instructions on pairs of 16 bit values and quads of 8-bit values. +They are described in the documentation of the `PTX instruction set architecture `_ chapter 9.7.13 but are only of any use in very special problem domains, for example for deep learning. + +It would be optimal if the compiler could automatically vectorize our kernels when they are called in a loop and vectorization is supported by the underlying accelerator. +However, besides full blown vector processors, mainstream CPUs do not support predicated execution or similar complex things within vector registers. +At most, there is support for masking operations which allow to emulate at least some conditional branching. +Therefore, this missing hardware capability has to be circumvented by the compiler. +There are scientific research projects such as the work done by Ralf Karrenberg et al [`1 `_, `2 `_, `3 `_ ] building on the *LLVM* compiler infrastructure supporting such whole-function vectorization. +However, current mainstream compilers do not support automatic vectorization of basic, non trivial loops containing control flow statements (``if``, ``else``, ``for``, etc.) or other non-trivial memory operations. +Therefore, it has to be made easier for the compiler to recognize the vectorization possibilities by making it more explicit. + +The opposite of automatic whole function vectorization is the fully explicit vectorization of expressions via compiler intrinsics directly resulting in the desired assembly instruction. +A big problem when trying to utilize fully explicit vectorization is, that there is no common foundation supported by all explicit vectorization methods. 
+A wrapper unifying the x86 SIMD intrinsics found in the ``intrin.h`` or ``x86intrin.h`` headers with those supported on other platforms, for example ARM NEON (``arm_neon.h``), PowerPC Altivec (``altivec.h``) or *CUDA* is not available and to write one is a huge task in itself. +However, if this would become available in the future, it could easily be integrated into *alpaka* kernels. + +Due to current compilers being unable to vectorize whole functions and the explicit vectorization intrinsics not being portable, one has to rely on the vectorization capabilities of current compilers for primitive loops only consisting of a few computations. +By creating a grid of data elements, where multiple elements are processed per thread and threads are pooled in independent blocks, as it is shown in the figure below, the user is free to loop sequentially over the elements or to use vectorization for selected expressions within the kernel. +Even the sequential processing of multiple elements per thread can be useful depending on the architecture. +For example, the *NVIDIA cuBLAS* general matrix-matrix multiplication (GEMM) internally executes only one thread for each second matrix data element to better utilize the registers available per thread. + +.. image:: /images/element.png + +.. note:: + The best solution to vectorization would be one, where the user does not have to do anything. + This is not possible because the smallest unit supplied by the user is a kernel which is executed in threads which can synchronize. + + It is not possible to execute multiple kernels sequentially to hide the vectorization by starting a kernel-thread for e.g. each 4th thread in a block and then looping over the 4 entries. + This would prohibit the synchronization between these threads. + By executing 4 fibers inside such a vectorization kernel-thread we would allow synchronization again but prevent the loop vectorizer from working. 
+
+Summary
+-------
+
+This abstraction is called *Redundant Hierarchical Parallelism*.
+This term is inspired by the paper *The Future of Accelerator Programming: Abstraction, Performance or Can We Have Both?*
+`PDF `_
+`DOI `_
+It investigates a similar *concept of copious parallel programming* reaching 80%-90% of the native performance while comparing CPU and GPU centric versions of an *OpenCL* n-body simulation with a general version utilizing parallelism on multiple hierarchy levels.
+
+The *CUDA* or *OpenCL* abstractions themselves are very similar to the one designed in the previous sections and consist of all but the Element level.
+However, as has been shown, all five abstraction hierarchy levels are necessary to fully utilize current architectures.
+By emulating unsupported or ignoring redundant levels of parallelism, algorithms written with this abstraction can always be mapped optimally to all supported accelerators. The following table summarizes the characteristics of the proposed hierarchy levels.
+
+ +-----------------+-----------------------+----------------+
+ | Hierarchy Level | Parallelism           | Synchronizable |
+ +=================+=======================+================+
+ | grid            | sequential / parallel | -- / X         |
+ +-----------------+-----------------------+----------------+
+ | block           | parallel              | --             |
+ +-----------------+-----------------------+----------------+
+ | warp            | parallel              | X              |
+ +-----------------+-----------------------+----------------+
+ | thread          | parallel / lock-step  | X              |
+ +-----------------+-----------------------+----------------+
+ | element         | sequential            | --             |
+ +-----------------+-----------------------+----------------+
+
+Depending on the queue a task is enqueued into, grids will either run in sequential order within the same queue or in parallel in different queues.
+They can be synchronized by using events.
+Blocks can not be synchronized and therefore can use the whole spectrum of parallelism ranging from fully parallel up to fully sequential execution depending on the device. +Warps combine the execution of multiple threads in lock-step and can be synchronized implicitly by synchronizing the threads they contain. +Threads within a block are executed in parallel warps and each thread computes a number of data elements sequentially. + diff --git a/thirdParty/cupla/alpaka/docs/source/basic/cheatsheet.rst b/thirdParty/cupla/alpaka/docs/source/basic/cheatsheet.rst new file mode 100644 index 0000000000..a6ceebf511 --- /dev/null +++ b/thirdParty/cupla/alpaka/docs/source/basic/cheatsheet.rst @@ -0,0 +1,281 @@ +Cheatsheet +========== + +.. only:: html + + Download pdf version :download:`here <../../cheatsheet/cheatsheet.pdf>` + +General +------- + +- Getting alpaka: https://github.com/alpaka-group/alpaka +- Issue tracker, questions, support: https://github.com/alpaka-group/alpaka/issues +- All alpaka names are in namespace alpaka and header file `alpaka/alpaka.hpp` +- This document assumes + + .. code-block:: c++ + + #include + using namespace alpaka; + +.. raw:: pdf + + Spacer 0,5 + +Accelerator and Device +---------------------- + +Define in-kernel thread indexing type + .. code-block:: c++ + + using Dim = dim::DimInt; + using Idx = IntegerType; + +Define accelerator type (CUDA, OpenMP,etc.) + .. code-block:: c++ + + using Acc = AcceleratorType; + + AcceleratorType: + .. code-block:: c++ + + acc::AccGpuCudaRt, + acc::AccCpuOmp2Blocks, + acc::AccCpuOmp2Threads, + acc::AccCpuOmp4, + acc::AccCpuTbbBlocks, + acc::AccCpuThreads, + acc::AccCpuFibers, + acc::AccCpuSerial + + +Select device for the given accelerator by index + .. code-block:: c++ + + auto const device = pltf::getDevByIdx(index); + + +Queue and Events +---------------- + +Create a queue for a device + .. code-block:: c++ + + using Queue = queue::Queue; + auto queue = Queue{device}; + + Property: + .. 
code-block:: c++ + + queue::Blocking + queue::NonBlocking + +Put a task for execution + .. code-block:: c++ + + queue::enqueue(queue, task); + +Wait for all operations in the queue + .. code-block:: c++ + + wait::wait(queue); + +Create an event + .. code-block:: c++ + + event::Event event{device}; + +Put an event to the queue + .. code-block:: c++ + + queue::enqueue(queue, event); + +Check if the event is completed + .. code-block:: c++ + + event::test(event); + +Wait for the event (and all operations put to the same queue before it) + .. code-block:: c++ + + wait::wait(event); + +Memory +------ + +Memory allocation and transfers are symmetric for host and devices, both done via alpaka API + +Create a CPU device for memory allocation on the host side + .. code-block:: c++ + + auto const devHost = pltf::getDevByIdx(0u); + +Allocate a buffer in host memory + .. code-block:: c++ + + vec::Vec extent = value; + using BufHost = mem::buf::Buf; + BufHost bufHost = mem::buf::alloc(devHost, extent); + +(Optional, affects CPU – GPU memory copies) Prepare it for asynchronous memory copies + .. code-block:: c++ + + mem::buf::prepareForAsyncCopy(bufHost); + +Get a raw pointer to a buffer initialization, etc. + .. code-block:: c++ + + DataType * raw = mem::view::getPtrNative(bufHost); + +Allocate a buffer in device memory + .. code-block:: c++ + + auto bufDevice = mem::buf::alloc(device, extent); + +Enqueue a memory copy from host to device + .. code-block:: c++ + + mem::view::copy(queue, bufDevice, bufHost, extent); + +Enqueue a memory copy from device to host + .. code-block:: c++ + + mem::view::copy(queue, bufHost, bufDevice, extent); + +.. raw:: pdf + + PageBreak + +Kernel Execution +---------------- + +Automatically select a valid kernel launch configuration + .. 
code-block:: c++ + + vec::Vec const globalThreadExtent = vectorValue; + vec::Vec const elementsPerThread = vectorValue; + + auto autoWorkDiv = workdiv::getValidWorkDiv( + device, + globalThreadExtent, elementsPerThread, + false, + workdiv::GridBlockExtentSubDivRestrictions::Unrestricted); + +Manually set a kernel launch configuration + .. code-block:: c++ + + vec::Vec const blocksPerGrid = vectorValue; + vec::Vec const threadsPerBlock = vectorValue; + vec::Vec const elementsPerThread = vectorValue; + + using WorkDiv = workdiv::WorkDivMembers; + auto manualWorkDiv = WorkDiv{blocksPerGrid, + threadsPerBlock, + elementsPerThread}; + +Instantiate a kernel and create a task that will run it (does not launch it yet) + .. code-block:: c++ + + Kernel kernel{argumentsForConstructor}; + auto taskRunKernel = kernel::createTaskKernel(workDiv, + kernel, + parameters); + +acc parameter of the kernel is provided automatically, does not need to be specified here + +Put the kernel for execution + .. code-block:: c++ + + queue::enqueue(queue, taskRunKernel); + +Kernel Implementation +--------------------- + +Define a kernel as a C++ functor + .. code-block:: c++ + + struct Kernel { + template + ALPAKA_FN_ACC void operator()(Acc const & acc, parameters) const { ... } + }; + +``ALPAKA_FN_ACC`` is required for kernels and functions called inside, ``acc`` is mandatory first parameter, its type is the template parameter + +Access multi-dimensional indices and extents of blocks, threads, and elements + .. code-block:: c++ + + auto idx = idx::getIdx(acc); + auto extent = workdiv::getWorkdiv(acc); + + Origin: + .. code-block:: c++ + + Grid, Block, Thread + + Unit: + .. code-block:: c++ + + Blocks, Threads, Elems + +Access components of multi-dimensional indices and extents + .. code-block:: c++ + + auto idxX = idx[0]; + +Linearize multi-dimensional vectors + .. code-block:: c++ + + auto linearIdx = idx::mapIdx<1u>(idx, extent); + +.. 
raw:: pdf + + Spacer 0,8 + +Allocate static shared memory variable + .. code-block:: c++ + + Type & var = block::shared::st::allocVar(acc); + +Get dynamic shared memory pool, requires the kernel to specialize + .. code-block:: c++ + + kernel::traits::BlockSharedMemDynSizeBytes + Type * dynamicSharedMemoryPool = block::shared::dyn::getMem(acc); + +Synchronize threads of the same block + .. code-block:: c++ + + block::sync::syncBlockThreads(acc); + +Atomic operations + .. code-block:: c++ + + auto result = atomic::atomicOp(acc, + arguments, + OperationHierarchy); + + Operation (all in atomic::op): + .. code-block:: c++ + + namespace atomic::op + Add, Sub, Min, Max, Exch, Inc, Dec, And, Or, Xor, Cas + + OperationHierarchy (all in hierarchy): + .. code-block:: c++ + + namespace hierarchy + Threads, Blocks, Grids + +Math functions take acc as additional first argument + .. code-block:: c++ + + math::sin(acc, argument); + +Similar for other math functions. + +Generate random numbers + .. code-block:: c++ + + auto distribution = rand::distribution::createNormalReal(acc); + auto generator = rand::generator::createDefault(acc, seed, subsequence); + auto number = distribution(generator); diff --git a/thirdParty/cupla/alpaka/docs/source/basic/install.rst b/thirdParty/cupla/alpaka/docs/source/basic/install.rst new file mode 100644 index 0000000000..1d199c9571 --- /dev/null +++ b/thirdParty/cupla/alpaka/docs/source/basic/install.rst @@ -0,0 +1,46 @@ +.. highlight:: bash + +Installation +============ + +* Clone alpaka from github.com + +.. code-block:: + + git clone https://github.com/alpaka-group/alpaka + cd alpaka + +* Install alpaka + +.. code-block:: + + # git clone https://github.com/alpaka-group/alpaka + # cd alpaka + mkdir build && cd build + cmake -DCMAKE_INSTALL_PREFIX=/install/ .. + cmake --install . + +* Configure Accelerators + +.. code-block:: + + # .. + cmake -DALPAKA_ACC_GPU_CUDA_ENABLE=ON .. + +* Build an example + +.. code-block:: + + # .. 
+ cmake -Dalpaka_BUILD_EXAMPLES=ON .. + make vectorAdd + ./example/vectorAdd/vectorAdd # execution + +* Build and run tests + +.. code-block:: + + # .. + cmake -DBUILD_TESTING=ON .. + make + ctest diff --git a/thirdParty/cupla/alpaka/docs/source/basic/intro.rst b/thirdParty/cupla/alpaka/docs/source/basic/intro.rst new file mode 100644 index 0000000000..fed72512bc --- /dev/null +++ b/thirdParty/cupla/alpaka/docs/source/basic/intro.rst @@ -0,0 +1,90 @@ +Introduction +============ + +The *alpaka* library defines and implements an abstract interface for the *hierarchical redundant parallelism* model. +This model exploits task- and data-parallelism as well as memory hierarchies at all levels of current multi-core architectures. +This allows to achieve performance portability across various types of accelerators by ignoring specific unsupported levels and utilizing only the ones supported on a specific accelerator. +All hardware types (multi- and many-core CPUs, GPUs and other accelerators) are treated and can be programmed in the same way. +The *alpaka* library provides back-ends for *CUDA*, *OpenMP*, *Boost.Fiber* and other methods. +The policy-based C++ template interface provided allows for straightforward user-defined extension of the library to support other accelerators. + +The library name *alpaka* is an acronym standing for **A**\ bstraction **L**\ ibrary for **Pa**\ rallel **K**\ ernel **A**\ cceleration. + +Example +------- + +.. literalinclude:: ../../../example/helloWorld/src/helloWorld.cpp + :language: C++ + :caption: helloWorld.cpp + +.. code-block:: cmake + :caption: CMakeLists.txt + + cmake_minimum_required(VERSION 3.15) + + set(_TARGET_NAME helloWorld) + project(${_TARGET_NAME}) + + find_package(alpaka REQUIRED) + + alpaka_add_executable(${_TARGET_NAME} helloWorld.cpp) + target_link_libraries( + ${_TARGET_NAME} + PUBLIC alpaka::alpaka) + +You can integrate alpaka into your project via ``find_package()`` in your ``CMakeLists.txt``. 
+This requires that you :doc:`install ` alpaka.
+If you do not install alpaka in a default path such as ``/usr/local/`` you have to set the ``CMake`` argument ``-Dalpaka_ROOT=/path/to/alpaka/install``.
+
+The cmake configuration decides which alpaka accelerators are available during compiling. For example, if you configure your ``cmake`` build with the CUDA back-end (``-DALPAKA_ACC_GPU_CUDA_ENABLE=ON``), ``cmake`` checks if the CUDA SDK is available and, if it is found, the C++ template ``alpaka::acc::AccGpuCudaRt`` is available during compiling. + +About alpaka +------------ + +alpaka is ... +~~~~~~~~~~~~~ + +Abstract + It describes parallel execution on multiple hierarchy levels. It allows to implement a mapping to various hardware architectures but is no optimal mapping itself. + +Sustainable + *alpaka* decouples the application from the availability of different accelerator frameworks in different versions, such as OpenMP, CUDA, HIP, etc. (50% on the way to reach full performance portability). + +Heterogeneous + An identical algorithm / kernel can be executed on heterogeneous parallel systems by selecting the target device. This allows the best performance for each algorithm and/or a good utilization of the system without major code changes. + +Maintainable + *alpaka* allows to provide a single version of the algorithm / kernel that can be used by all back-ends. There is no need for "copy and paste" kernels with different API calls for different accelerators. All the accelerator dependent implementation details are hidden within the *alpaka* library. + +Testable + Due to the easy back-end switch, no special hardware is required for testing the kernels. Even if the simulation itself always uses the *CUDA* back-end, the tests can completely run on a CPU. 
As long as the *alpaka* library is thoroughly tested for compatibility between the acceleration back-ends, the user simulation code is guaranteed to generate identical results (ignoring rounding errors / non-determinism) and is portable without any changes. + +Optimizable + Everything in *alpaka* can be replaced by user code to optimize for special use-cases. + +Extensible + Every concept described by the *alpaka* abstraction can be implemented by users. Therefore it is possible to non-intrusively define new devices, queues, buffer types or even whole accelerator back-ends. + +Data Structure Agnostic + The user can use and define arbitrary data structures. + +alpaka does not ... +~~~~~~~~~~~~~~~~~~~ + +Automatically provide an optimal mapping of kernels to various acceleration platforms + Except in trivial examples an optimal execution always depends on suitable selected data structures. An adaptive selection of data structures is a separate topic that has to be implemented in a distinct library. + +Automatically optimize concurrent data access + *alpaka* does not provide features to create optimized memory layouts. + +Handle differences in arithmetic operations + For example, due to **different rounding** or different implementations of floating point operations, results can differ slightly between accelerators. + +Guarantee determinism of results + Due to the freedom of the library to reorder or repartition the threads within the tasks it is not possible or even desired to preserve deterministic results. For example, the non-associativity of floating point operations gives non-deterministic results within and across accelerators. + +The *alpaka* library is aimed at parallelization on shared memory, i.e. within nodes of a cluster. +It does not compete with libraries for distribution of processes across nodes and communication among those. +For these purposes libraries like MPI (Message Passing Interface) or others should be used. 
+MPI is situated one layer higher and can be combined with *alpaka* to facilitate the hardware of a whole heterogeneous cluster. +The *alpaka* library can be used for parallelization within nodes, MPI for parallelization across nodes. diff --git a/thirdParty/cupla/alpaka/docs/source/basic/library.rst b/thirdParty/cupla/alpaka/docs/source/basic/library.rst new file mode 100644 index 0000000000..7174fdde62 --- /dev/null +++ b/thirdParty/cupla/alpaka/docs/source/basic/library.rst @@ -0,0 +1,162 @@ +Library Interface +================= + +As described in the chapter about the :doc:`Abstraction `, the general design of the library is very similar to *CUDA* and *OpenCL* but extends both by some points, while not requiring any language extensions. +General interface design as well as interface implementation decisions differentiating *alpaka* from those libraries are described in the Rationale section. +It uses C++ because it is one of the most performant languages available on nearly all systems. +Furthermore, C++14 allows to describe the concepts in a very abstract way that is not possible with many other languages. +The *alpaka* library extensively makes use of advanced functional C++ template meta-programming techniques. +The Implementation Details section discusses the C++ library and the way it provides extensibility and optimizability. + +Structure +--------- + +The *alpaka* library allows offloading of computations from the host execution domain to the accelerator execution domain, whereby they are allowed to be identical. + +In the abstraction hierarchy the library code is interleaved with user supplied code as is depicted in the following figure. + +.. image:: /images/execution_domain.png + :alt: Execution Domains + +User code invokes library functions, which in turn execute the user provided thread function (kernel) in parallel on the accelerator. +The kernel in turn calls library functions when accessing accelerator properties and methods. 
+Additionally, the user can enhance or optimize the library implementations by extending or replacing specific parts. + +The *alpaka* abstraction itself only defines requirements a type has to fulfill to be usable with the template functions the library provides. +These type constraints are called concepts in C++. + +*A concept is a set of requirements consisting of valid expressions, associated types, invariants, and complexity guarantees. +A type that satisfies the requirements is said to model the concept. +A concept can extend the requirements of another concept, which is called refinement.* `BoostConcepts `_ + +Concepts allow to safely define polymorphic algorithms that work with objects of many different types. + +The *alpaka* library implements a stack of concepts and their interactions modeling the abstraction defined in the previous chapter. +Furthermore, default implementations for various devices and accelerators modeling those are included in the library. +The interaction of the main user facing concepts can be seen in the following figure. + +.. image:: /images/structure_assoc.png + :alt: user / alpaka code interaction + + +For each type of ``Device`` there is a ``Platform`` for enumerating the available ``Device``s. +A ``Device`` is the requirement for creating ``Queues`` and ``Events`` as it is for allocating ``Buffers`` on the respective ``Device``. ``Buffers`` can be copied, their memory be set and they can be pinned or mapped. +Copying and setting a buffer requires the corresponding ``Copy`` and ``Set`` tasks to be enqueued into the ``Queue``. +An ``Event`` can be enqueued into a ``Queue`` and its completion state can be queried by the user. +It is possible to wait for (synchronize with) a single ``Event``, a ``Queue`` or a whole ``Device``. +An ``Executor`` can be enqueued into a ``Queue`` and will execute the ``Kernel`` (after all previous tasks in the queue have been completed). 
+The ``Kernel`` in turn has access to the ``Accelerator`` it is running on. +The ``Accelerator`` provides the ``Kernel`` with its current index in the block or grid, their extents or other data as well as it allows to allocate shared memory, execute atomic operations and many more. + + +Interface Usage +--------------- + +Accelerator Functions +````````````````````` + +Functions that should be executable on an accelerator have to be annotated with the execution domain (one of ``ALPAKA_FN_HOST``, ``ALPAKA_FN_ACC`` and ``ALPAKA_FN_HOST_ACC``). +They most probably also require access to the accelerator data and methods, such as indices and extents as well as functions to allocate shared memory and to synchronize all threads within a block. +Therefore the accelerator has to be passed in as a templated constant reference parameter as can be seen in the following code snippet. + +.. code-block:: cpp + + template< + typename TAcc> + ALPAKA_FN_ACC auto doSomethingOnAccelerator( + TAcc const & acc/*, + ...*/) // Arbitrary number of parameters + -> int // Arbitrary return type + { + //... + } + + +Kernel Definition +````````````````` + +A kernel is a special function object which has to conform to the following requirements: + +* it has to fulfill the ``std::is_trivially_copyable`` trait (has to be copyable via memcpy) +* the ``operator()`` is the kernel entry point + * it has to be an accelerator executable function + * it has to return ``void``. + * its first argument has to be the accelerator (templated for arbitrary accelerator back-ends). + +The following code snippet shows a basic example of a kernel function object. + +.. code-block:: cpp + + struct MyKernel + { + template< + typename TAcc> // Templated on the accelerator type. + ALPAKA_FN_ACC // Macro marking the function to be executable on all accelerators. + auto operator()( // The function / kernel to execute. + TAcc const & acc/*, // The specific accelerator implementation. + ...*/) const // Must be 'const'. 
+ -> void + { + //... + } + // Class can have members but has to be std::is_trivially_copyable. + // Classes must not have pointers or references to host memory! + }; + +The kernel function object is shared across all threads in all blocks. +Due to the block execution order being undefined, there is no safe and consistent way of altering state that is stored inside of the function object. +Therefore, the ``operator()`` of the kernel function object has to be ``const`` and is not allowed to modify any of the object members. + + +Index and Work Division +``````````````````````` + +The ``alpaka::workdiv::getWorkDiv`` and the ``alpaka::idx::getIdx`` functions both return a vector of the dimensionality the accelerator has been defined with. +They are parametrized by the origin of the calculation as well as the unit in which the values are calculated. +For example, ``alpaka::workdiv::getWorkDiv(acc)`` returns a vector with the extents of the grid in units of threads. + + +Memory Management +````````````````` + +The memory allocation function of the *alpaka* library (``alpaka::mem::buf::alloc(device, extents)``) is uniform for all devices, even for the host device. +It does not return raw pointers but reference counted memory buffer objects that remove the necessity for manual freeing and the possibility of memory leaks. +Additionally the memory buffer objects know their extents, their pitches as well as the device they reside on. +This allows buffers that possibly reside on different devices with different pitches to be copied only by providing the buffer objects as well as the extents of the region to copy (``alpaka::mem::view::copy(bufDevA, bufDevB, copyExtents``). + +Kernel Execution +```````````````` + +The following source code listing shows the execution of a kernel by enqueuing the execution task into a queue. + +.. code-block:: cpp + + // Define the dimensionality of the task. + using Dim = alpaka::dim::DimInt<1u>; + // Define the type of the indexes. 
+ using Idx = std::size_t; + // Define the accelerator to use. + using Acc = alpaka::acc::AccCpuSerial; + // Select the queue type. + using Queue = alpaka::queue::QueueCpuNonBlocking; + + // Select a device to execute on. + auto devAcc(alpaka::pltf::getDevByIdx(0)); + // Create a queue to enqueue the execution into. + Queue queue(devAcc); + + // Create a 1-dimensional work division with 256 blocks of 16 threads each. + auto const workDiv(alpaka::workdiv::WorkDivMembers(256u, 16u)); + // Create an instance of the kernel function object. + MyKernel kernel; + // Enqueue the execution task into the queue. + alpaka::kernel::exec(queue, workDiv, kernel/*, arguments ...*/); + +The dimensionality of the task as well as the type for index and extent have to be defined explicitly. +Following this, the type of accelerator to execute on, as well as the type of the queue have to be defined. +For both of these types instances have to be created. +For the accelerator this has to be done indirectly by enumerating the required device via the device manager, whereas the queue can be created directly. + +To execute the kernel, an instance of the kernel function object has to be constructed. +Following this, an execution task combining the work division (grid and block sizes) with the kernel function object and the bound invocation arguments has to be created. +After that this task can be enqueued into a queue for immediate or later execution (depending on the queue used). diff --git a/thirdParty/cupla/alpaka/docs/source/conf.py b/thirdParty/cupla/alpaka/docs/source/conf.py new file mode 100644 index 0000000000..1ecd8094d0 --- /dev/null +++ b/thirdParty/cupla/alpaka/docs/source/conf.py @@ -0,0 +1,173 @@ +# -*- coding: utf-8 -*- +# Configuration file for the Sphinx documentation builder. 
+ +import os +import subprocess + + +# -- Project information ----------------------------------------------------- + +project = 'alpaka' +copyright = 'Documentation under CC-BY 4.0, Benjamin Worpitz, René Widera, Axel Huebl, Michael Bussmann' +author = 'Benjamin Worpitz, René Widera, Axel Huebl, Michael Bussmann' +# The short X.Y version. +version = u'0.5.0' +# The full version, including alpha/beta/rc tags. +release = u'0.5.0' + +# The master toctree document. +master_doc = 'index' + +# -- General configuration --------------------------------------------------- + +on_rtd = os.environ.get('READTHEDOCS', None) == 'True' + +show_authors = True + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.mathjax', +# 'sphinx.ext.napoleon', + 'breathe', + 'sphinxcontrib.programoutput', +# 'matplotlib.sphinxext.plot_directive' +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["Thumbs.db", ".DS_Store"] + +source_suffix = ['.rst'] +master_doc = 'index' +language = None + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' #'default' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. 
They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# modifies the HTML Sphinx Doc layout +html_css_files = ["custom.css"] + +html_logo = "../logo/alpaka.svg" +html_theme_options = { + "logo_only" : True +} + +# -- Options for HTMLHelp output --------------------------------------------- + +htmlhelp_basename = 'alpakadoc' + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + 'papersize': 'a4paper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + 'preamble': r'\setcounter{tocdepth}{2}', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} +latex_logo = "../logo/alpaka.pdf" + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'alpaka-doc.tex', u'alpaka Documentation', + u'The alpaka Community', 'manual'), +] + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'alpaka', u'alpaka Documentation', + [author], 1) +] + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'alpaka', u'alpaka Documentation', + author, 'alpaka', 'Abstraction Library for Parallel Kernel Acceleration', + """ + The alpaka library is a header-only C++14 abstraction library for + accelerator development. 
Its aim is to provide performance portability + across accelerators through the abstraction (not hiding!) of the underlying + levels of parallelism. + """), +] + +# -- Options for Epub output ------------------------------------------------- + +# A list of files that should not be packed into the epub file. +epub_exclude_files = ['search.html'] + + +# -- Extension configuration ------------------------------------------------- + +breathe_projects = { "alpaka": "../doxygen/xml" } +breathe_default_project = "alpaka" + +breathe_domain_by_extension = { + "cpp": "cpp", + "h": "cpp", + "hpp": "cpp", + "tpp": "cpp" +} + +# define alpaka attributes +# breath has problems to parse C++ attributes +cpp_id_attributes = ["ALPAKA_FN_ACC", + "ALPAKA_FN_HOST", + "ALPAKA_FN_HOST_ACC", + "ALPAKA_FN_INLINE", + "ALPAKA_NO_HOST_ACC_WARNING", + "ALPAKA_STATIC_ACC_MEM_CONSTANT", + "ALPAKA_STATIC_ACC_MEM_GLOBAL", + ] + +# -- processing -- + +if on_rtd: + subprocess.call('cd ..; doxygen', shell=True) + subprocess.call('cd ../cheatsheet; rst2pdf -s cheatsheet.style ../source/basic/cheatsheet.rst -o cheatsheet.pdf', shell=True) +else: + import sphinx_rtd_theme + html_theme = "sphinx_rtd_theme" + html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] diff --git a/thirdParty/cupla/alpaka/docs/source/dev/backends.rst b/thirdParty/cupla/alpaka/docs/source/dev/backends.rst new file mode 100644 index 0000000000..590343f4e7 --- /dev/null +++ b/thirdParty/cupla/alpaka/docs/source/dev/backends.rst @@ -0,0 +1,650 @@ +.. highlight:: bash + +Back-ends +========= + +Accelerator Implementations +``````````````````````````` +The table shows which native implementation or information is used to represent an alpaka functionality. + +.. 
table:: + + +---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+ + | alpaka | Serial | std::thread | Boost.Fiber | OpenMP 2.0 | OpenMP 4.0 | CUDA 9.0+ | + +===============================================================+===============================================+=================================================================================+================================================================================+=====================================================================================+=======================================================================================================================================+==================================================+ + | Devices | Host Core | Host Cores | Host Core | Host Cores | Host Cores | NVIDIA GPUs | + +---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+ + | Lib/API | standard C++ | std::thread | boost::fibers::fiber | OpenMP 2.0 | OpenMP 4.0 | CUDA 9.0+ | + 
+---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+ + | Kernel execution | sequential | std::thread(kernel) | boost::fibers::fiber(kernel) | omp_set_dynamic(0), #pragma omp parallel num_threads(iNumKernelsInBlock) | #pragma omp target, #pragma omp teams num_teams(...) thread_limit(...), #pragma omp distribute, #pragma omp parallel num_threads(...) | cudaConfigureCall, cudaSetupArgument, cudaLaunch | + +---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+ + | Execution strategy grid-blocks | sequential | sequential | sequential | sequential | undefined | undefined | + 
+---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+ + | Execution strategy block-kernels | sequential | preemptive multitasking | cooperative multithreading | preemptive multitasking | preemptive multitasking | lock-step within warps | + +---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+ + | getIdx | emulated | block-kernel: mapping of std::this_thread::get_id() grid-block: member variable | block-kernel: mapping of std::this_fiber::get_id() grid-block: member variable | block-kernel: omp_get_num_threads() to 3D index mapping grid-block: member variable | block-kernel: omp_get_num_threads() to 3D index mapping grid-block: member variable | threadIdx, blockIdx | + 
+---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+ + | getExtent | member variables | member variables | member variables | member variables | member variables | gridDim, blockDim | + +---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+ + | getBlockSharedExternMem | allocated in memory prior to kernel execution | allocated in memory prior to kernel execution | allocated in memory prior to kernel execution | allocated in memory prior to kernel execution | allocated in memory prior to kernel execution | __shared__ | + 
+---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+ + | allocBlockSharedMem | master thread allocates | syncBlockKernels -> master thread allocates -> syncBlockKernels | syncBlockKernels -> master thread allocates -> syncBlockKernels | syncBlockKernels -> master thread allocates -> syncBlockKernels | syncBlockKernels -> master thread allocates -> syncBlockKernels | __shared__ | + +---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+ + | syncBlockKernels | not required | barrier | barrier | #pragma omp barrier | #pragma omp barrier | __syncthreads | + 
+---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+ + | atomicOp | hierarchy depended | std::lock_guard< std::mutex > | n/a | #pragma omp critical | #pragma omp critical | atomicXXX | + +---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+ + | ALPAKA_FN_HOST_ACC, ALPAKA_FN_ACC, ALPAKA_FN_HOST | inline | inline | inline | inline | inline | __device__, __host__, __forceinline__ | + +---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+ + +Serial +`````` + +The serial accelerator only allows blocks with 
exactly one thread. +Therefore it does not implement real synchronization or atomic primitives. + +Threads +``````` + +Execution ++++++++++ + +To prevent recreation of the threads between execution of different blocks in the grid, the threads are stored inside a thread pool. +This thread pool is local to the invocation because making it local to the KernelExecutor could mean a heavy memory usage and lots of idling kernel-threads when there are multiple KernelExecutors around. +Because the default policy of the threads in the pool is to yield instead of waiting, this would also slow down the system immensely. + +Fibers +`````` + +Execution ++++++++++ + +To prevent recreation of the fibers between execution of different blocks in the grid, the fibers are stored inside a fibers pool. +This fiber pool is local to the invocation because making it local to the KernelExecutor could mean a heavy memory usage when there are multiple KernelExecutors around. + +OpenMP +`````` + +Execution ++++++++++ + +Parallel execution of the kernels in a block is required because when syncBlockThreads is called all of them have to be done with their work up to this line. +So we have to spawn one real thread per kernel in a block. +``omp for`` is not useful because it is meant for cases where multiple iterations are executed by one thread but in our case a 1:1 mapping is required. +Therefore we use ``omp parallel`` with the specified number of threads in a block. +Another reason for not using ``omp for`` like ``#pragma omp parallel for collapse(3) num_threads(blockDim.x*blockDim.y*blockDim.z)`` is that ``#pragma omp barrier`` used for intra block synchronization is not allowed inside ``omp for`` blocks. + +Because OpenMP is designed for a 1:1 abstraction of hardware to software threads, the block size is restricted by the number of OpenMP threads allowed by the runtime. 
+This could be as little as 2 or 4 kernels but on a system with 4 cores and hyper-threading OpenMP can also allow 64 threads. + +Index ++++++ + +OpenMP only provides a linear thread index. This index is converted to a 3 dimensional index at runtime. + +Atomic +++++++ + +We can not use ``#pragma omp atomic`` because braces or calling other functions directly after ``#pragma omp atomic`` are not allowed. +Because we are implementing the CUDA atomic operations which return the old value, this requires ``#pragma omp critical`` to be used. +``omp_set_lock`` is an alternative but is usually slower. + +CUDA +```` + +Nearly all CUDA functionality can be directly mapped to alpaka function calls. +A major difference is that CUDA requires the block and grid sizes to be given in (x, y, z) order. alpaka uses the mathematical C/C++ array indexing scheme [z][y][x]. In both cases x is the innermost / fast running index. + +Furthermore alpaka does not require the indices and extents to be 3-dimensional. +The accelerators are templatized on and support arbitrary dimensionality. +NOTE: Currently the CUDA implementation is restricted to a maximum of 3 dimensions! + +NOTE: You have to be careful when mixing alpaka and non alpaka CUDA code. The CUDA-accelerator back-end can change the current CUDA device and will NOT set the device back to the one prior to the invocation of the alpaka function. + + +Programming Interface +--------------------- + +*Function Attributes* + +.. 
table:: + + +-----------------------------------------------------+---------------------------------------------------------+ + | CUDA | alpaka | + +=====================================================+=========================================================+ + | ``__host__`` | ``ALPAKA_FN_HOST`` | + +-----------------------------------------------------+---------------------------------------------------------+ + | ``__device__`` | ``ALPAKA_FN_ACC`` | + +-----------------------------------------------------+---------------------------------------------------------+ + | ``__global__`` | ``ALPAKA_FN_ACC`` | + +-----------------------------------------------------+---------------------------------------------------------+ + | ``__host__ __device__`` | ``ALPAKA_FN_HOST_ACC`` | + +-----------------------------------------------------+---------------------------------------------------------+ + +.. note:: + + You can not call CUDA-only methods, except when ``ALPAKA_ACC_GPU_CUDA_ONLY_MODE`` is enabled. + +*Memory* + +.. table:: + + +-----------------------------------------------------+----------------------------------------------------------------------------+ + | CUDA | alpaka | + +=====================================================+============================================================================+ + | ``__shared__`` | ``alpaka::block::shared::st::allocVar(acc)`` | + +-----------------------------------------------------+----------------------------------------------------------------------------+ + | ``__constant__`` | ``ALPAKA_STATIC_ACC_MEM_CONSTANT`` | + +-----------------------------------------------------+----------------------------------------------------------------------------+ + | ``__device__`` | ``ALPAKA_STATIC_ACC_MEM_GLOBAL`` | + +-----------------------------------------------------+----------------------------------------------------------------------------+ + +.. 
doxygenfunction:: alpaka::block::shared::st::allocVar + :project: alpaka + +.. doxygendefine:: ALPAKA_STATIC_ACC_MEM_CONSTANT + :project: alpaka + +.. doxygendefine:: ALPAKA_STATIC_ACC_MEM_GLOBAL + :project: alpaka + +*Index / Work Division* + +.. table:: + + +---------------------------------+----------------------------------------------------------------------------------+ + | CUDA | alpaka | + +=================================+==================================================================================+ + | ``threadIdx`` | ``alpaka::idx::getIdx(acc)`` | + +---------------------------------+----------------------------------------------------------------------------------+ + | ``blockIdx`` | ``alpaka::idx::getIdx(acc)`` | + +---------------------------------+----------------------------------------------------------------------------------+ + | ``blockDim`` | ``alpaka::workdiv::getWorkDiv(acc)`` | + +---------------------------------+----------------------------------------------------------------------------------+ + | ``gridDim`` | ``alpaka::workdiv::getWorkDiv(acc)`` | + +---------------------------------+----------------------------------------------------------------------------------+ + | ``warpSize`` | ``alpaka::warp::getSize(acc)`` | + +---------------------------------+----------------------------------------------------------------------------------+ + +*Types* + +.. table:: + + +----------+-------------------------------------+ + | CUDA | alpaka | + +==========+=====================================+ + | ``dim3`` | ``alpaka::vec::Vec< TDim, TVal >`` | + +----------+-------------------------------------+ + + + +CUDA Runtime API +++++++++++++++++ + +The following tables list the functions available in the `CUDA Runtime API `_ and their equivalent alpaka functions: + +*Device Management* + +.. 
table:: + + +---------------------------------+-----------------------------------------------------------------------+ + | CUDA | alpaka | + +=================================+=======================================================================+ + | cudaChooseDevice | -- | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaDeviceGetAttribute | -- | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaDeviceGetByPCIBusId | -- | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaDeviceGetCacheConfig | -- | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaDeviceGetLimit | -- | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaDeviceGetP2PAttribute | -- | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaDeviceGetPCIBusId | -- | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaDeviceGetSharedMemConfig | -- | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaDeviceGetQueuePriorityRange | -- | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaDeviceReset | alpaka::dev::reset(device) | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaDeviceSetCacheConfig | -- | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaDeviceSetLimit | -- | + 
+---------------------------------+-----------------------------------------------------------------------+ + | cudaDeviceSetSharedMemConfig    | --                                                                    | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaDeviceSynchronize           | void alpaka::wait::wait(device)                                       | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaGetDevice                   | n/a (no current device)                                               | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaGetDeviceCount              | std::size_t alpaka::pltf::getDevCount< TPltf >()                      | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaGetDeviceFlags              | --                                                                    | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaGetDeviceProperties         | alpaka::acc::getAccDevProps(dev) (Only some properties available)     | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaIpcCloseMemHandle           | --                                                                    | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaIpcGetEventHandle           | --                                                                    | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaIpcGetMemHandle             | --                                                                    | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaIpcOpenEventHandle          | --                                                                    | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaIpcOpenMemHandle            | --                                                                    | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaSetDevice                   | n/a (no current device)                                               | + +---------------------------------+-----------------------------------------------------------------------+ + | 
cudaSetDeviceFlags | -- | + +---------------------------------+-----------------------------------------------------------------------+ + | cudaSetValidDevices | -- | + +---------------------------------+-----------------------------------------------------------------------+ + + +*Error Handling* + +.. table:: + + +---------------------+----------------------------------------------------------+ + | CUDA | alpaka | + +=====================+==========================================================+ + | cudaGetErrorName | n/a (handled internally, available in exception message) | + +---------------------+----------------------------------------------------------+ + | cudaGetErrorString | n/a (handled internally, available in exception message) | + +---------------------+----------------------------------------------------------+ + | cudaGetLastError | n/a (handled internally) | + +---------------------+----------------------------------------------------------+ + | cudaPeekAtLastError | n/a (handled internally) | + +---------------------+----------------------------------------------------------+ + + +*Queue Management* + +.. 
table:: + + +------------------------------+---------------------------------------------------------+ + | CUDA | alpaka | + +==============================+=========================================================+ + | cudaStreamAddCallback | alpaka::queue::enqueue(queue, [](){dosomething();}) | + +------------------------------+---------------------------------------------------------+ + | cudaStreamAttachMemAsync | -- | + +------------------------------+---------------------------------------------------------+ + | cudaStreamCreate | - queue=alpaka::queue::QueueCudaRtNonBlocking(device); | + | \ | - queue=alpaka::queue::QueueCudaRtBlocking(device); | + +------------------------------+---------------------------------------------------------+ + | cudaStreamCreateWithFlags | see cudaStreamCreate (cudaStreamNonBlocking hard coded) | + +------------------------------+---------------------------------------------------------+ + | cudaStreamCreateWithPriority | -- | + +------------------------------+---------------------------------------------------------+ + | cudaStreamDestroy | n/a (Destructor) | + +------------------------------+---------------------------------------------------------+ + | cudaStreamGetFlags | -- | + +------------------------------+---------------------------------------------------------+ + | cudaStreamGetPriority | -- | + +------------------------------+---------------------------------------------------------+ + | cudaStreamQuery | bool alpaka::queue::empty(queue) | + +------------------------------+---------------------------------------------------------+ + | cudaStreamSynchronize | void alpaka::wait::wait(queue) | + +------------------------------+---------------------------------------------------------+ + | cudaStreamWaitEvent | void alpaka::wait::wait(queue, event) | + +------------------------------+---------------------------------------------------------+ + +*Event Management* + +.. 
table:: + + +--------------------------+--------------------------------------------+ + | CUDA | alpaka | + +==========================+============================================+ + | cudaEventCreate | alpaka::event::Event< TQueue > event(dev); | + +--------------------------+--------------------------------------------+ + | cudaEventCreateWithFlags | -- | + +--------------------------+--------------------------------------------+ + | cudaEventDestroy | n/a (Destructor) | + +--------------------------+--------------------------------------------+ + | cudaEventElapsedTime | -- | + +--------------------------+--------------------------------------------+ + | cudaEventQuery | bool alpaka::event::test(event) | + +--------------------------+--------------------------------------------+ + | cudaEventRecord | void alpaka::queue::enqueue(queue, event) | + +--------------------------+--------------------------------------------+ + | cudaEventSynchronize | void alpaka::wait::wait(event) | + +--------------------------+--------------------------------------------+ + +*Memory Management* + +.. 
table:: + + +----------------------------+--------------------------------------------------------------------------------------------+ + | CUDA | alpaka | + +============================+============================================================================================+ + | cudaArrayGetInfo | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaFree | n/a (automatic memory management with reference counted memory handles) | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaFreeArray | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaFreeHost | n/a | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaFreeMipmappedArray | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaGetMipmappedArrayLevel | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaGetSymbolAddress | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaGetSymbolSize | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaHostAlloc | n/a, the existing buffer can be pinned using alpaka::mem::buf::prepareForAsyncCopy(memBuf) | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaHostGetDevicePointer | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | 
cudaHostGetFlags           | --                                                                                         | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaHostRegister           | --                                                                                         | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaHostUnregister         | --                                                                                         | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMalloc                 | alpaka::mem::buf::alloc(device, extents1D)                                                 | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMalloc3D               | alpaka::mem::buf::alloc(device, extents3D)                                                 | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMalloc3DArray          | --                                                                                         | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMallocArray            | --                                                                                         | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMallocHost             | alpaka::mem::buf::alloc(device, extents) 1D, 2D, 3D supported!  
| + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMallocManaged | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMallocMipmappedArray | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMallocPitch | alpaka::mem::alloc(device, extents2D) | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemAdvise | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemGetInfo | - alpaka::dev::getMemBytes | + | | - alpaka::dev::getFreeMemBytes | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemPrefetchAsync | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemRangeGetAttribute | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemRangeGetAttributes | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpy | alpaka::mem::view::copy(memBufDst, memBufSrc, extents1D) | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpy2D | alpaka::mem::view::copy(memBufDst, memBufSrc, extents2D) | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpy2DArrayToArray | -- | + 
+----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpy2DAsync | alpaka::mem::view::copy(memBufDst, memBufSrc, extents2D, queue) | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpy2DFromArray | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpy2DFromArrayAsync | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpy2DToArray | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpy2DToArrayAsync | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpy3D | alpaka::mem::view::copy(memBufDst, memBufSrc, extents3D) | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpy3DAsync | alpaka::mem::view::copy(memBufDst, memBufSrc, extents3D, queue) | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpy3DPeer | alpaka::mem::view::copy(memBufDst, memBufSrc, extents3D) | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpy3DPeerAsync | alpaka::mem::view::copy(memBufDst, memBufSrc, extents3D, queue) | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpyArrayToArray | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | 
cudaMemcpyAsync | alpaka::mem::view::copy(memBufDst, memBufSrc, extents1D, queue) | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpyFromArray | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpyFromArrayAsync | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpyFromSymbol | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpyFromSymbolAsync | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpyPeer | alpaka::mem::view::copy(memBufDst, memBufSrc, extents1D) | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpyPeerAsync | alpaka::mem::view::copy(memBufDst, memBufSrc, extents1D, queue) | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpyToArray | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpyToArrayAsync | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpyToSymbol | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpyToSymbolAsync | -- | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemset | alpaka::mem::view::set(memBufDst, byte, extents1D) | + 
+----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemset2D               | alpaka::mem::view::set(memBufDst, byte, extents2D)                                         | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemset2DAsync          | alpaka::mem::view::set(memBufDst, byte, extents2D, queue)                                  | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemset3D               | alpaka::mem::view::set(memBufDst, byte, extents3D)                                         | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemset3DAsync          | alpaka::mem::view::set(memBufDst, byte, extents3D, queue)                                  | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemsetAsync            | alpaka::mem::view::set(memBufDst, byte, extents1D, queue)                                  | + +----------------------------+--------------------------------------------------------------------------------------------+ + | make_cudaExtent            | --                                                                                         | + +----------------------------+--------------------------------------------------------------------------------------------+ + | make_cudaPitchedPtr        | --                                                                                         | + +----------------------------+--------------------------------------------------------------------------------------------+ + | make_cudaPos               | --                                                                                         | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpyHostToDevice     | n/a (direction of copy is determined automatically)                                        | + +----------------------------+--------------------------------------------------------------------------------------------+ + | cudaMemcpyDeviceToHost     | n/a (direction of copy is determined automatically)                                        | + 
+----------------------------+--------------------------------------------------------------------------------------------+ + + +*Execution Control* + +.. table:: + + +----------------------------+--------------------------------------------------------------------------------------------------------------+ + | CUDA | alpaka | + +============================+==============================================================================================================+ + | cudaFuncGetAttributes | -- | + +----------------------------+--------------------------------------------------------------------------------------------------------------+ + | cudaFuncSetCacheConfig | -- | + +----------------------------+--------------------------------------------------------------------------------------------------------------+ + | cudaFuncSetSharedMemConfig | -- | + +----------------------------+--------------------------------------------------------------------------------------------------------------+ + | cudaLaunchKernel | - alpaka::kernel::exec(queue, workDiv, kernel, params...) | + | \ | - alpaka::kernel::BlockSharedExternMemSizeBytes< TKernel >::getBlockSharedExternMemSizeBytes<...>(...) | + +----------------------------+--------------------------------------------------------------------------------------------------------------+ + | cudaSetDoubleForDevice | n/a (alpaka assumes double support) | + +----------------------------+--------------------------------------------------------------------------------------------------------------+ + | cudaSetDoubleForHost | n/a (alpaka assumes double support) | + +----------------------------+--------------------------------------------------------------------------------------------------------------+ + +*Occupancy* + +.. 
table:: + + +--------------------------------------------------------+--------+ + | CUDA | alpaka | + +========================================================+========+ + | cudaOccupancyMaxActiveBlocksPerMultiprocessor | -- | + +--------------------------------------------------------+--------+ + | cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags | -- | + +--------------------------------------------------------+--------+ + + +*Unified Addressing* + +.. table:: + + +--------------------------+--------+ + | CUDA | alpaka | + +==========================+========+ + | cudaPointerGetAttributes | -- | + +--------------------------+--------+ + + +*Peer Device Memory Access* + +.. table:: + + +-----------------------------+----------------------------------+ + | CUDA | alpaka | + +=============================+==================================+ + | cudaDeviceCanAccessPeer | -- | + +-----------------------------+----------------------------------+ + | cudaDeviceDisablePeerAccess | -- | + +-----------------------------+----------------------------------+ + | cudaDeviceEnablePeerAccess | automatically done when required | + +-----------------------------+----------------------------------+ + +**OpenGL, Direct3D, VDPAU, EGL, Graphics Interoperability** + +*not available* + +**Texture/Surface Reference/Object Management** + +*not available* + +**Version Management** + +*not available* + + + +HIP +``` + +.. warning:: + + The HIP documentation is outdated and must be overworked. + +Current Restrictions on HCC platform +++++++++++++++++++++++++++++++++++++ + +- Workaround for unsupported ``syncthreads_{count|and|or}``. + + - Uses temporary shared value and atomics + +- Workaround for buggy ``hipStreamQuery``, ``hipStreamSynchronize``. + + - Introduces own queue management + - ``hipStreamQuery`` and ``hipStreamSynchronize`` do not work in multithreaded environment + +- Workaround for missing ``cuStreamWaitValue32``. 
+ + - Polls value each 10 ms + +- Device constant memory not supported yet +- Note that ``printf`` in kernels is still not supported in HIP +- Exclude ``hipMalloc3D`` and ``hipMallocPitch`` when size is zero otherwise they throw an Unknown Error +- ``TestAccs`` excludes 3D specialization of HIP back-end for now because ``verifyBytesSet`` fails in ``memView`` for 3D specialization +- ``dim3`` structure is not available on device (use ``alpaka::vec::Vec`` instead) +- Constructors' attributes unified with destructors'. + + - Host/device signature must match in HIP(HCC) + +- A chain of functions must also provide correct host-device signatures + + - E.g. a host function cannot be called from a host-device function + +- Recompile your target when HCC linker returned the error: + "File format not recognized + clang-7: error: linker command failed with exit code 1" +- If compile-error occurred the linker still may link, but without the device code +- AMD device architecture currently hardcoded in ``alpakaConfig.cmake`` + +Compiling HIP from Source ++++++++++++++++++++++++++ + +Follow `HIP Installation`_ guide for installing HIP. +HIP requires either *nvcc* or *hcc* to be installed on your system (see guide for further details). + +.. _HIP Installation: https://github.com/ROCm-Developer-Tools/HIP/blob/master/INSTALL.md + +- If you want the HIP binaries to be located in a directory that does not require superuser access, be sure to change the install directory of HIP by modifying the ``CMAKE_INSTALL_PREFIX`` cmake variable. +- Also, after the installation is complete, add the following line to the ``.profile`` file in your home directory, in order to add the path to the HIP binaries to PATH: ``PATH=$PATH:`` + +.. code-block:: + + git clone --recursive https://github.com/ROCm-Developer-Tools/HIP.git + cd HIP + mkdir -p build + cd build + cmake -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}" -DCMAKE_INSTALL_PREFIX=${YOUR_HIP_INSTALL_DIR} -DBUILD_TESTING=OFF .. 
+ make + make install + +- Set the appropriate paths (edit ``${YOUR_**}`` variables) + +.. code-block:: + + # HIP_PATH required by HIP tools + export HIP_PATH=${YOUR_HIP_INSTALL_DIR} + # Paths required by HIP tools + export CUDA_PATH=${YOUR_CUDA_ROOT} + # - if required, path to HCC compiler. Default /opt/rocm/hcc. + export HCC_HOME=${YOUR_HCC_ROOT} + # - if required, path to HSA include, lib. Default /opt/rocm/hsa. + export HSA_PATH=${YOUR_HSA_PATH} + # HIP binaries and libraries + export PATH=${HIP_PATH}/bin:$PATH + export LD_LIBRARY_PATH=${HIP_PATH}/lib64:${LD_LIBRARY_PATH} + +- Test the HIP binaries + +.. code-block:: + + # calls nvcc or hcc + which hipcc + hipcc -V + which hipconfig + hipconfig -v + + +Verifying HIP Installation +++++++++++++++++++++++++++ + +- If PATH points to the location of the HIP binaries, the following command should list several relevant environment variables, and also the selected compiler on your ``system-\`hipconfig -f\``` +- Compile and run the `square sample`_, as pointed out in the original `HIP install guide`_. + +.. _square sample: https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/0_Intro/square +.. _HIP install guide: https://github.com/ROCm-Developer-Tools/HIP/blob/master/INSTALL.md#user-content-verify-your-installation + +Compiling Examples with HIP Back End +++++++++++++++++++++++++++++++++++++ + +As of now, the back-end has only been tested on the NVIDIA platform. + +* NVIDIA Platform + + * One issue in this branch of alpaka is that the host compiler flags don't propagate to the device compiler, as they do in CUDA. This is because a counterpart to the ``CUDA_PROPAGATE_HOST_FLAGS`` cmake variable has not been defined in the FindHIP.cmake file. + alpaka forwards the host compiler flags in cmake to the ``HIP_NVCC_FLAGS`` cmake variable, which also takes user-given flags. To add flags to this variable, toggle the advanced mode in ``ccmake``. 
+ + +Random Number Generator Library rocRAND for HIP Back End +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +*rocRAND* provides an interface for HIP, where the cuRAND or rocRAND API is called depending on the chosen HIP platform (can be configured with cmake in alpaka). + +Clone the rocRAND repository, then build and install it + +.. code-block:: + + git clone https://github.com/ROCmSoftwarePlatform/rocRAND + cd rocRAND + mkdir -p build + cd build + cmake -DCMAKE_INSTALL_PREFIX=${HIP_PATH} -DBUILD_BENCHMARK=OFF -DBUILD_TEST=OFF -DCMAKE_MODULE_PATH=${HIP_PATH}/cmake .. + make + + +The ``CMAKE_MODULE_PATH`` is a cmake variable for locating module finding scripts like *FindHIP.cmake*. +The paths to the *rocRAND* library and include directories should be appended to the ``CMAKE_PREFIX_PATH`` variable. diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Details.md b/thirdParty/cupla/alpaka/docs/source/dev/details.rst similarity index 55% rename from thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Details.md rename to thirdParty/cupla/alpaka/docs/source/dev/details.rst index cad30d6434..0e426b0624 100644 --- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Details.md +++ b/thirdParty/cupla/alpaka/docs/source/dev/details.rst @@ -1,9 +1,10 @@ -[:arrow_up: Up](../Library.md) +.. highlight:: cpp Details ======= -![Overview of the structure of the *alpaka* library with concepts and implementations.](structure.png) +.. image:: /images/structure.png + :alt: Overview of the structure of the *alpaka* library with concepts and implementations. The full stack of concepts defined by the *alpaka* library and their inheritance hierarchy is shown in the third column of the preceding figure. Default implementations for those concepts can be seen in the blueish columns. 
@@ -24,35 +25,35 @@ They allow arbitrary types as parameters, as long as they model the required con C++ provides a language inherent object oriented abstraction allowing to check that parameters to a function comply with the concept they are required to model. By defining interface classes, which model the *alpaka* concepts, the user would be able to inherit his extension classes from the interfaces he wants to model and implement the abstract virtual methods the interfaces define. The *alpaka* functions in turn would use the corresponding interface types as their parameter types. -For example, the `Buffer` concept requires methods for getting the pitch or changing the memory pinning state. -With this intrusive object oriented design pattern the `BufCpu` or `BufCudaRt` classes would have to inherit from an `IBuffer` interface and implement the abstract methods it declares. +For example, the ``Buffer`` concept requires methods for getting the pitch or changing the memory pinning state. +With this intrusive object oriented design pattern the ``BufCpu`` or ``BufCudaRt`` classes would have to inherit from an ``IBuffer`` interface and implement the abstract methods it declares. An example of this basic pattern is shown in the following source snippet: -```C++ -struct IBuffer -{ - virtual std::size_t getPitch() const = 0; - virtual void pin() = 0; - virtual void unpin() = 0; - ... -}; - -struct BufCpu : public IBuffer -{ - virtual std::size_t getPitch() const override { ... } - virtual void pin() override { ... } - virtual void unpin() override { ... } - ... -}; - -ALPAKA_FN_HOST auto copy( - IBuffer & dst, - IBuffer const & src) --> void -{ - ... -} -``` +.. code-block:: + + struct IBuffer + { + virtual std::size_t getPitch() const = 0; + virtual void pin() = 0; + virtual void unpin() = 0; + ... + }; + + struct BufCpu : public IBuffer + { + virtual std::size_t getPitch() const override { ... } + virtual void pin() override { ... } + virtual void unpin() override { ... 
} + ... + }; + + ALPAKA_FN_HOST auto copy( + IBuffer & dst, + IBuffer const & src) + -> void + { + ... + } The compiler can then check at compile time that the objects the user wants to use as function parameters can be implicitly cast to the interface type, which is the case for inherited base classes. The compiler returns an error message on a type mismatch. @@ -60,128 +61,130 @@ However, if the *alpaka* library were using those language inherent object orien Classes and run-time polymorphism require the implementer of extensions to intrusively inherit from predefined interfaces and override special virtual functions. This is feasible for user defined classes or types where the source code is available and where it can be changed. -The `std::vector` class template on the other hand would not be able to model the `Buffer` concept because we can not change its definition to inherit from the `IBuffer` interface class since it is part of the standard library. +The ``std::vector`` class template on the other hand would not be able to model the ``Buffer`` concept because we can not change its definition to inherit from the ``IBuffer`` interface class since it is part of the standard library. The standard inheritance based object orientation of C++ only works well when all the code it is to interoperate with can be changed to implement the interfaces. It does not enable interaction with unalterable or existing code that is too complex to change, which is the reality in the majority of software projects. Another option to implement an extensible library is to follow the way the C++ standard library uses. It allows to specialize function templates for user types to model concepts without altering the types themselves. -For example, the `std::begin` and `std::end` free function templates can be specialized for user defined types. 
-With those functions specialized, the C++11 range-based for loops (`for(auto & i : userContainer){...}`) see *C++ Standard 6.5.4/1* can be used with user defined types. -Equally specializations of `std::swap` and other standard library function templates can be defined to extend those with support for user types. +For example, the ``std::begin`` and ``std::end`` free function templates can be specialized for user defined types. +With those functions specialized, the C++11 range-based for loops (``for(auto & i : userContainer){...}``) see *C++ Standard 6.5.4/1* can be used with user defined types. +Equally specializations of ``std::swap`` and other standard library function templates can be defined to extend those with support for user types. One Problem with function specialization is, that only full specializations are allowed. A partial function template specialization is not allowed by the standard. Another problem can emerge due to users carelessly overloading the template functions instead of specializing them. Mixing function overloading and function template specialization on the same base template function can result in unexpected results. The reasons and effects of this are described more closely in an article from H. Sutter (currently convener of the ISO C++ committee) called *Sutter's Mill: Why Not Specialize Function Templates?* in the *C/C++ Users Journal* in July 2001. - + +.. seealso:: + `different way `_ The solution given in the article is to provide *"a single function template that should never be specialized or overloaded"*. This function simply forwards its arguments *"to a class template containing a static function with the same signature"*. This template class can fully or partially be specialized without affecting overload resolution. The way the *alpaka* library implements this is by not using the C++ inherent object orientation but lifting those abstractions to a higher level. 
-Instead of using a non-extensible`class`/`struct` for defining the interface, a namespace is utilized. +Instead of using a non-extensible``class``/``struct`` for defining the interface, a namespace is utilized. In place of abstract virtual member functions of the interface, *alpaka* defines free functions within those namespaces. All those functions are templates allowing the user to call them with arbitrary self defined types and not only those inheriting from a special interface type. -Unlike member functions, they have no implicit `this` pointer, so the object instance has to be explicitly given as a parameter. +Unlike member functions, they have no implicit ``this`` pointer, so the object instance has to be explicitly given as a parameter. Overriding the abstract virtual interface methods is replaced by the specialization of a template type that is defined for each such namespace function. A concept is completely implemented by specializing the predefined template types. This allows to extend and fine-tune the implementation non-intrusively. -For example, the corresponding pitch and memory pinning template types can be specialized for `std::vector`. -After doing this, the `std::vector` can be used everywhere a buffer is accepted as argument throughout the whole *alpaka* library without ever touching its definition. +For example, the corresponding pitch and memory pinning template types can be specialized for ``std::vector``. +After doing this, the ``std::vector`` can be used everywhere a buffer is accepted as argument throughout the whole *alpaka* library without ever touching its definition. A simple function allowing arbitrary tasks to be enqueued into a queue can be implemented in the way shown in the following code. -The `TSfinae` template parameter will be explained in a [following section](#Template-Specialization-Selection-on-Arbitrary-Conditions). 
- -```C++ -namespace queue -{ - template< - typename TQueue, - typename TTask, - typename TSfinae = void> - struct Enqueue; - - template< - typename TQueue, - typename TTask> - ALPAKA_FN_HOST auto enqueue( - TQueue & queue, - TTask & task) - -> void - { - Enqueue< - TQueue, - TTask> - ::enqueue( - queue, - task); - } -} -``` - -A user who wants his queue type to be used with this `enqueue` function has to specialize the `Enqueue` template struct. -This can be either done partially by only replacing the `TQueue` template parameter and accepting arbitrary tasks or by fully specializing and replacing both `TQueue` and `TTask`. This gives the user complete freedom of choice. -The example given in the following code shows this by specializing the `Enqueue` type for a user queue type `UserQueue` and arbitrary tasks. - -```C++ -struct UserQueue{}; - -namespace queue -{ - // partial specialization - template< - typename TTask> - struct Enqueue< - UserQueue - TTask> - { - ALPAKA_FN_HOST static auto enqueue( - UserQueue & queue, - TTask & task) - -> void - { - //... - } - }; -} -``` - -In addition the subsequent code shows a full specialization of the `Enqueue` type for a given `UserQueue` and a `UserTask`. - -```C++ -struct UserQueue{}; -struct UserTask{}; - -namespace queue -{ - // full specialization - template<> - struct Enqueue< - UserQueue - UserTask> - { - ALPAKA_FN_HOST static auto enqueue( - UserQueue & queue, - UserTask & task) - -> void - { - //... - } - }; -} -``` - -When the `enqueue` function template is called with an instance of `UserQueue`, the most specialized version of the `Enqueue` template is selected depending on the type of the task `TTask` it is called with. - -A type can model the queue concept completely by defining specializations for `alpaka::queue::Enqueue` and `alpaka::queue::Empty`. -This functionality can be accessed by the corresponding `alpaka::queue::enqueue` and `alpaka::queue::empty` template functions. 
+The ``TSfinae`` template parameter will be explained in a `following section <#Template-Specialization-Selection-on-Arbitrary-Conditions>`_. + +.. code-block:: + + namespace queue + { + template< + typename TQueue, + typename TTask, + typename TSfinae = void> + struct Enqueue; + + template< + typename TQueue, + typename TTask> + ALPAKA_FN_HOST auto enqueue( + TQueue & queue, + TTask & task) + -> void + { + Enqueue< + TQueue, + TTask> + ::enqueue( + queue, + task); + } + } + +A user who wants his queue type to be used with this ``enqueue`` function has to specialize the ``Enqueue`` template struct. +This can be either done partially by only replacing the ``TQueue`` template parameter and accepting arbitrary tasks or by fully specializing and replacing both ``TQueue`` and ``TTask``. This gives the user complete freedom of choice. +The example given in the following code shows this by specializing the ``Enqueue`` type for a user queue type ``UserQueue`` and arbitrary tasks. + +.. code-block:: + + struct UserQueue{}; + + namespace queue + { + // partial specialization + template< + typename TTask> + struct Enqueue< + UserQueue + TTask> + { + ALPAKA_FN_HOST static auto enqueue( + UserQueue & queue, + TTask & task) + -> void + { + //... + } + }; + } + +In addition the subsequent code shows a full specialization of the ``Enqueue`` type for a given ``UserQueue`` and a ``UserTask``. + +.. code-block:: + + struct UserQueue{}; + struct UserTask{}; + + namespace queue + { + // full specialization + template<> + struct Enqueue< + UserQueue + UserTask> + { + ALPAKA_FN_HOST static auto enqueue( + UserQueue & queue, + UserTask & task) + -> void + { + //... + } + }; + } + +When the ``enqueue`` function template is called with an instance of ``UserQueue``, the most specialized version of the ``Enqueue`` template is selected depending on the type of the task ``TTask`` it is called with. 
+ +A type can model the queue concept completely by defining specializations for ``alpaka::queue::Enqueue`` and ``alpaka::queue::Empty``. +This functionality can be accessed by the corresponding ``alpaka::queue::enqueue`` and ``alpaka::queue::empty`` template functions. Currently there is no native language support for describing and checking concepts in C++ at compile time. -A study group (SG8) is working on the ISO [specification for conecpts](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4377.pdf) and compiler forks implementing them do exist. -For usage in current C++ there are libraries like [*Boost.ConceptCheck*](http://www.boost.org/doc/libs/1_58_0/libs/concept_check/concept_check.htm) which try to emulate requirement checking of concept types. +A study group (SG8) is working on the ISO `specification for concepts `_ and compiler forks implementing them do exist. +For usage in current C++ there are libraries like `Boost.ConceptCheck `_ which try to emulate requirement checking of concept types. Those libraries often exploit the preprocessor and require non-trivial changes to the function declaration syntax. Therefore the *alpaka* library does not currently make use of *Boost.ConceptCheck*. Neither does it facilitate the proposed concept specification due to its dependency on non-standard compilers. @@ -199,44 +202,44 @@ Template Specialization Selection on Arbitrary Conditions Basic template specialization only allows for a selection of the most specialized version where all explicitly stated types have to be matched identically. It is not possible to enable or disable a specialization based on arbitrary compile time expressions depending on the parameter types. -To allow such conditions, *alpaka* adds a defaulted and unused `TSfinae` template parameter to all declarations of the implementation template structs. -This was shown using the example of the `Enqueue` template type. 
+To allow such conditions, *alpaka* adds a defaulted and unused ``TSfinae`` template parameter to all declarations of the implementation template structs. +This was shown using the example of the ``Enqueue`` template type. The C++ technique called SFINAE, an acronym for *Substitution failure is not an error* allows to disable arbitrary specializations depending on compile time conditions. Specializations where the substitution of the parameter types by the deduced types would result in invalid code will not result in a compile error, but will simply be omitted. -An example in the context of the `Enqueue` template type is shown in the following code. - -```C++ -struct UserQueue{}; - -namespace queue -{ - template< - typename TQueue, - typename TTask> - struct Enqueue< - TQueue - TTask, - std::enable_if_t< - std::is_base_of::value - && (TTask::TaskId == 1u) - >> - { - ALPAKA_FN_HOST static auto enqueue( - TQueue & queue, - TTask & task) - -> void - { - //... - } - }; -} -``` - -The `Enqueue` specialization shown here does not require any direct type match for the `TQueue` or the `TTask` template parameter. -It will be used in all contexts where `TQueue` has inherited from `UserQueue` and where the `TTask` has a static const integral member value `TaskId` that equals one. -If the `TTask` type does not have a `TaskId` member, this code would be invalid and the substitution would fail. +An example in the context of the ``Enqueue`` template type is shown in the following code. + +.. code-block:: + + struct UserQueue{}; + + namespace queue + { + template< + typename TQueue, + typename TTask> + struct Enqueue< + TQueue + TTask, + std::enable_if_t< + std::is_base_of::value + && (TTask::TaskId == 1u) + >> + { + ALPAKA_FN_HOST static auto enqueue( + TQueue & queue, + TTask & task) + -> void + { + //... + } + }; + } + +The ``Enqueue`` specialization shown here does not require any direct type match for the ``TQueue`` or the ``TTask`` template parameter. 
+It will be used in all contexts where ``TQueue`` has inherited from ``UserQueue`` and where the ``TTask`` has a static const integral member value ``TaskId`` that equals one. +If the ``TTask`` type does not have a ``TaskId`` member, this code would be invalid and the substitution would fail. However, due to SFINAE, this would not result in a compiler error but rather only in omitting this specialization. -The `std::enable_if` template results in a valid expression, if the condition it contains evaluates to true, and an invalid expression if it is false. +The ``std::enable_if`` template results in a valid expression, if the condition it contains evaluates to true, and an invalid expression if it is false. Therefore it can be used to disable specializations depending on arbitrary boolean conditions. -It is utilized in the case where the `TaskId` member is unequal one or the `TQueue` does not inherit from `UserQueue`. -In this cirumstances, the condition itself results in valid code but because it evaluates to false, the `std::enable_if` specialization results in invalid code and the whole `Enqueue` template specialization gets omitted. +It is utilized in the case where the ``TaskId`` member is unequal to one or the ``TQueue`` does not inherit from ``UserQueue``. +In these circumstances, the condition itself results in valid code but because it evaluates to false, the ``std::enable_if`` specialization results in invalid code and the whole ``Enqueue`` template specialization gets omitted. diff --git a/thirdParty/cupla/alpaka/docs/source/dev/sphinx.rst b/thirdParty/cupla/alpaka/docs/source/dev/sphinx.rst new file mode 100644 index 0000000000..139af45540 --- /dev/null +++ b/thirdParty/cupla/alpaka/docs/source/dev/sphinx.rst @@ -0,0 +1,105 @@ +Sphinx +====== + +.. sectionauthor:: Axel Huebl, alpaka-group + +In the following section we explain how to contribute to this documentation. 
+ +If you are reading the `HTML version `_ and want to improve or correct existing pages, check the "*Edit on GitHub*" link on the right upper corner of each document. + +Alternatively, go to `docs/source` in our source code and follow the directory structure of `reStructuredText`_ (``.rst``) files there. +For intrusive changes, like structural changes to chapters, please open an issue to discuss them beforehand. + +.. _reStructuredText: https://www.sphinx-doc.org/en/stable/rest.html + +Build Locally +------------- + +This document is built based on free open-source software, namely `Sphinx`_, `Doxygen`_ (C++ APIs as XML), `Breathe`_ (to include doxygen XML in Sphinx) and `rst2pdf`_ (render the cheat sheet). +A web-version is hosted on `ReadTheDocs`_. + +.. _Sphinx: https://github.com/sphinx-doc/sphinx +.. _Doxygen: http://doxygen.org +.. _Breathe: https://github.com/michaeljones/breathe +.. _rst2pdf: https://rst2pdf.org/ +.. _ReadTheDocs: https://readthedocs.org/ + +The following requirements need to be installed (once) to build our documentation successfully: + +.. code-block:: bash + + cd docs/ + + # doxygen is not shipped via pip, install it externally, + # from the homepage, your package manager, conda, etc. + # example: + sudo apt-get install doxygen + # sudo pacman -S doxygen + + # python tools & style theme + pip install -r requirements.txt # --user + + +With all documentation-related software successfully installed, just run the following commands to build your docs locally. +Please check your documentation build is successful and renders as you expected before opening a pull request! + +.. code-block:: bash + + # skip this if you are still in docs/ + cd docs/ + + # parse the C++ API documentation (default: xml format) + doxygen Doxyfile + + # render the cheatsheet.pdf + rst2pdf -s cheatsheet/cheatsheet.style source/basic/cheatsheet.rst -o cheatsheet/cheatsheet.pdf + + # render the '.rst' files with sphinx + make html + + # open it, e.g. 
with firefox :) + firefox build/html/index.html + + # now again for the pdf :) + make latexpdf + + # open it, e.g. with okular + build/latex/alpaka.pdf + +.. hint:: + + Run ``make clean`` to clean the build directory before executing the actual make. This is necessary to reflect changes outside the rst files. + +.. hint:: + + There is a checklinks target to check links in the rst files on availability: + + .. code-block:: bash + + # check existence of links + # cd docs/ + make checklinks + +.. hint:: + + The Doxyfile for doxygen is configured to output in xml format per default. + Other targets can be configured in the Doxyfile. The final documentation is stored in ``docs/doxygen/``. + + .. code-block:: bash + + # run in docs/doxygen/ + sed -i -E 's/(GENERATE_HTML\s*=\s*)NO/\1YES/g' Doxyfile + +readthedocs +----------- + +To maintain or import a github project an account on `ReadTheDocs`_ is required. +Further instructions can be found on `readthedocs on github `_ and `readthedocs import guide `_. + +Useful Links +------------ + + * `A primer on writing reStructuredText files for sphinx `_ + * `Why You Shouldn't Use "Markdown" for Documentation `_ + * `reStructuredText vs. Markdown `_ + * `Markdown Limitations in Sphinx `_ diff --git a/thirdParty/cupla/alpaka/docs/source/dev/style.rst b/thirdParty/cupla/alpaka/docs/source/dev/style.rst new file mode 100644 index 0000000000..fe6d4dfb07 --- /dev/null +++ b/thirdParty/cupla/alpaka/docs/source/dev/style.rst @@ -0,0 +1,156 @@ +.. highlight:: cpp + +Coding Guidelines +================== + +.. attention:: + The Coding Guidelines are currently revised + +Naming +------ + +* Types are always in PascalCase (KernelExecCuda, BufT, ...) and singular. +* Variables are always in camelCase (memBufHost, ...) and plural for collections and singular else. +* Namespaces are always in lowercase and singular is preferred. +* There are no two consecutive upper case letters (AccOpenMp, HtmlRenderer, IoHandler, ...). 
This makes names more easily readable. + + +Types +----- + +* Always use integral types with known width (``int32_t``, ``uint64_t``, ...). + Never use ``int``, ``unsigned long``, etc. + + +Type Qualifiers +--------------- + +The order of type qualifiers should be: +``Type const * const`` for a const pointer to a const Type. +``Type const &`` for a reference to a const Type. + +The reason is that types can be read from right to left correctly without jumping back and forth. +``const Type * const`` and ``const Type &`` would require jumping in either way to read them correctly. + + +Variables +--------- + +* Variables should always be initialized on construction because this can produce hard to debug errors. + This can (nearly) always be done even in performance critical code without sacrificing speed by using a functional programming style. +* Variables should (nearly) always be ``const`` to make the code more easy to understand. + This is equivalent to functional programming and the SSA (static single assignment) style used by LLVM. + This should have no speed implication as every half baked compiler analyses the usage of variables and reuses registers. +* Variable definitions should be differentiated from assignments by using either ``(...)`` or ``{...}`` but never ``=`` for definitions. + Use ``uint32_t const iUsageOfThisVariable(42);`` instead of ``uint32_t const iUsageOfThisVariable = 42;`` + + +Comments +-------- + +* Always use C++-Style comments ``//`` +* For types use + ``//#############################################################################`` + to start the comment block. +* For functions use + ``//-----------------------------------------------------------------------------`` + to start the comment block. +* Never write comments for closing braces (namespaces, classes, etc ...) + + +Braces +------ + +* Braces (opening and closing) for classes, structs, functions, namespaces, etc. appear on a new line. 
Exception: If the function or class body is empty, the opening and closing braces are on the same (next) line. +* Only braces for variable initialization can appear in-line. + + +Indentation +----------- + +* Always indent everything by *one level* (namespace body, class members, function body, ...) +* Do not use more indentation e.g. to align function parameters. + + +Spaces +------ + +* Trailing white-spaces are forbidden. +* There is no space between keywords (if, for, ...) and the opening parenthesis. +* There is no space after the opening ``(`` or ``<`` and before the closing ``)`` or ``>``. +* There is a space before and after binary operators (=, \*, +, ...) +* There is no space after the unary operators !, ~, ... + + +Functions +--------- + +* Always use the trailing return type syntax with the return type on a new line even if the return type is void: + +.. code-block:: + + auto func() + -> bool + +* This makes it easier to see the return type because it is on its own line. +* This leads to a consistent style for constructs where there is no alternative style (lambdas, functions templates with dependent return types) and standard functions. +* Each function parameter is on a new indented line: + +.. code-block:: + + auto func( + float f1, + float f2) + -> bool + { + return true; + } + +.. code-block:: + + func( + 1.0f, + 2.0f); + +* Makes it easier to see how many parameters there are and which position they have. + + +Templates +--------- + +* Template parameters are prefixed with ``T`` to differentiate them from class or function local typedefs. +* Each template parameter is on a new indented line: + +.. code-block:: c++ + + template< + typename TParam, + typename TArgs...> + auto func() + -> bool + +* Makes it easier to see how many template parameters there are and which position they have. +* Always use ``typename`` for template parameters. There is NO difference to class and typename matches the intent better. 
+ + +Traits +------ + +* Trait classes always have one more template parameter (with default parameter) then is required for enabling SFINAE in the specialization: + +.. code-block:: + + template< + typename T, + typename TSfinae = void> + struct GetOffsets; + +* Template trait aliases always end with a ``T`` e.g. ``BufT`` while the corresponding trait ends with ``Type`` e.g. ``BufType`` +* Traits for implementations always have the same name as the accessor function but in PascalCase while the member function is camelCase again: ``sin(){...}`` and ``Sin{sin(){...}};`` + +Includes +-------- + +* The order of includes is from the most specialized header to the most general one. + This order helps to find missing includes in more specialized headers because the general ones are always included afterwards. +* A comment with the types or functions included by a include file make it easier to find out why a special header is included. diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/block/block.png b/thirdParty/cupla/alpaka/docs/source/images/block.png similarity index 100% rename from thirdParty/cupla/alpaka/doc/markdown/user/abstraction/block/block.png rename to thirdParty/cupla/alpaka/docs/source/images/block.png diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/block/block_scale.png b/thirdParty/cupla/alpaka/docs/source/images/block_scale.png similarity index 100% rename from thirdParty/cupla/alpaka/doc/markdown/user/abstraction/block/block_scale.png rename to thirdParty/cupla/alpaka/docs/source/images/block_scale.png diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/element/element.png b/thirdParty/cupla/alpaka/docs/source/images/element.png similarity index 100% rename from thirdParty/cupla/alpaka/doc/markdown/user/abstraction/element/element.png rename to thirdParty/cupla/alpaka/docs/source/images/element.png diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/execution_domain.png 
b/thirdParty/cupla/alpaka/docs/source/images/execution_domain.png similarity index 100% rename from thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/execution_domain.png rename to thirdParty/cupla/alpaka/docs/source/images/execution_domain.png diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/execution_domain.svg b/thirdParty/cupla/alpaka/docs/source/images/execution_domain.svg similarity index 100% rename from thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/execution_domain.svg rename to thirdParty/cupla/alpaka/docs/source/images/execution_domain.svg diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/structure.png b/thirdParty/cupla/alpaka/docs/source/images/structure.png similarity index 100% rename from thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/structure.png rename to thirdParty/cupla/alpaka/docs/source/images/structure.png diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/structure.svg b/thirdParty/cupla/alpaka/docs/source/images/structure.svg similarity index 100% rename from thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/structure.svg rename to thirdParty/cupla/alpaka/docs/source/images/structure.svg diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/structure_assoc.png b/thirdParty/cupla/alpaka/docs/source/images/structure_assoc.png similarity index 100% rename from thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/structure_assoc.png rename to thirdParty/cupla/alpaka/docs/source/images/structure_assoc.png diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/structure_assoc.svg b/thirdParty/cupla/alpaka/docs/source/images/structure_assoc.svg similarity index 100% rename from thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/structure_assoc.svg rename to thirdParty/cupla/alpaka/docs/source/images/structure_assoc.svg diff 
--git a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/thread/thread.png b/thirdParty/cupla/alpaka/docs/source/images/thread.png similarity index 100% rename from thirdParty/cupla/alpaka/doc/markdown/user/abstraction/thread/thread.png rename to thirdParty/cupla/alpaka/docs/source/images/thread.png diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/warp/warp.png b/thirdParty/cupla/alpaka/docs/source/images/warp.png similarity index 100% rename from thirdParty/cupla/alpaka/doc/markdown/user/abstraction/warp/warp.png rename to thirdParty/cupla/alpaka/docs/source/images/warp.png diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/x86/x86_cpu.png b/thirdParty/cupla/alpaka/docs/source/images/x86_cpu.png similarity index 100% rename from thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/x86/x86_cpu.png rename to thirdParty/cupla/alpaka/docs/source/images/x86_cpu.png diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/x86/x86_cpu_mapping.png b/thirdParty/cupla/alpaka/docs/source/images/x86_cpu_mapping.png similarity index 100% rename from thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/x86/x86_cpu_mapping.png rename to thirdParty/cupla/alpaka/docs/source/images/x86_cpu_mapping.png diff --git a/thirdParty/cupla/alpaka/docs/source/index.rst b/thirdParty/cupla/alpaka/docs/source/index.rst new file mode 100644 index 0000000000..0550a8f57b --- /dev/null +++ b/thirdParty/cupla/alpaka/docs/source/index.rst @@ -0,0 +1,73 @@ +:orphan: + +.. only:: html + + .. image:: ../logo/alpaka.svg + +.. only:: latex + + .. image:: ../logo/alpaka.pdf + +*alpaka - An Abstraction Library for Parallel Kernel Acceleration* + +The alpaka library is a header-only C++14 abstraction library for accelerator development. Its aim is to provide performance portability across accelerators through the abstraction (not hiding!) of the underlying levels of parallelism. + +.. 
CAUTION:: + The readthedocs pages are work in progress and contain outdated sections. + +alpaka - How to Read This Document +---------------------------------- + +Generally, **follow the manual pages in-order** to get started. +Individual chapters are based on the information of the chapters before. + +.. only:: html + + The online version of this document is **versioned** and shows by default the manual of the last *stable* version of alpaka. + If you are looking for the latest *development* version, `click here `_. + +.. note:: + + Are you looking for our latest Doxygen docs for the API? + + - See https://alpaka-group.github.io/alpaka/ + + +.. toctree:: + :caption: Basic + :maxdepth: 1 + + basic/intro.rst + basic/install.rst + basic/abstraction.rst + basic/library.rst + basic/cheatsheet.rst + +.. toctree:: + :caption: Advanced + :maxdepth: 1 + + advanced/rationale.rst + advanced/mapping.rst + +.. toctree:: + :caption: Extra Info + :maxdepth: 1 + + info/similar_projects.rst + +.. toctree:: + :caption: Development + :maxdepth: 1 + + dev/backends.rst + dev/details.rst + dev/style + dev/sphinx + API Reference + +Indices and Tables +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/thirdParty/cupla/alpaka/docs/source/info/similar_projects.rst b/thirdParty/cupla/alpaka/docs/source/info/similar_projects.rst new file mode 100644 index 0000000000..4ba94d5708 --- /dev/null +++ b/thirdParty/cupla/alpaka/docs/source/info/similar_projects.rst @@ -0,0 +1,45 @@ +Similar Projects +================ + +`KOKKOS `_ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
seealso:: + * https://www.xsede.org/documents/271087/586927/Edwards-2013-XSCALE13-Kokkos.pdf + * https://trilinos.org/oldsite/events/trilinos_user_group_2013/presentations/2013-11-TUG-Kokkos-Tutorial.pdf + * https://on-demand.gputechconf.com/supercomputing/2013/presentation/SC3103\_Towards-Performance-Portable-Applications-Kokkos.pdf + * https://dx.doi.org/10.3233/SPR-2012-0343 + +Kokkos provides an abstract interface for portable, performant shared-memory programming. +It is a C++ library that offers ``parallel_for``, ``parallel_reduce`` and similar functions +for describing the pattern of the parallel tasks. The execution policy determines how the +threads are executed. For example, this influences the sizes of blocks of threads or if +static or dynamic scheduling should be used. The library abstracts the kernel as a function +object that cannot have any user-defined parameters for its ``operator()``. Arguments have +to be stored in members of the function object, coupling algorithm and data together. *KOKKOS* +provides both abstractions for parallel execution of code and data management. +Multidimensional arrays with a neutral indexing and an architecture-dependent layout are +available, which can be used, for example, to abstract the underlying hardware's preferred +memory access scheme, which could be row-major, column-major or even blocked. + + +`Thrust `_ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Thrust is a parallel algorithms library resembling the C++ Standard Template Library (STL). +It allows selecting either the *CUDA*, *TBB* or *OpenMP* back-end at make-time. Because it is +based on generic ``host_vector`` and ``device_vector`` container objects, it tightly couples +the data structure and the parallelization strategy. There exist many similar libraries such +as `ArrayFire `_ (*CUDA*, *OpenCL*, native C++), +`VexCL `_ (*OpenCL*, *CUDA*), +`ViennaCL `_ (*OpenCL*, *CUDA*, *OpenMP*) and +`hemi `_ (*CUDA*, native C++). + +.. 
seealso:: + * Phalanx + See `here `_ + It is very similar to *alpaka* in the way it abstracts the accelerators. + Its C++ interface provides CUDA, OpenMP, and GASNet back-ends. + * Aura + * Intel TBB + * UPC++ diff --git a/thirdParty/cupla/alpaka/example/CMakeLists.txt b/thirdParty/cupla/alpaka/example/CMakeLists.txt index 6b330464d1..dd326b9326 100644 --- a/thirdParty/cupla/alpaka/example/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/example/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2015-2020 Benjamin Worpitz, Jan Stephan # -# This file exemplifies usage of Alpaka. +# This file exemplifies usage of alpaka. # # Permission to use, copy, modify, and/or distribute this software for any # purpose with or without fee is hereby granted, provided that the above @@ -28,7 +28,8 @@ project("alpakaExamples") # Add subdirectories. ################################################################################ -add_subdirectory("bufferCopy") +add_subdirectory("bufferCopy/") +add_subdirectory("heatEquation/") add_subdirectory("helloWorld/") add_subdirectory("helloWorldLambda/") add_subdirectory("reduce/") diff --git a/thirdParty/cupla/alpaka/example/bufferCopy/CMakeLists.txt b/thirdParty/cupla/alpaka/example/bufferCopy/CMakeLists.txt index 287aa3c86a..b22eceaff2 100644 --- a/thirdParty/cupla/alpaka/example/bufferCopy/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/example/bufferCopy/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Erik Zenker, Benjamin Worpitz, Jan Stephan # -# This file exemplifies usage of Alpaka. +# This file exemplifies usage of alpaka. # # Permission to use, copy, modify, and/or distribute this software for any # purpose with or without fee is hereby granted, provided that the above @@ -34,9 +34,15 @@ project(${_TARGET_NAME}) # Find alpaka. 
if(NOT TARGET alpaka::alpaka) - set(ALPAKA_ROOT "${CMAKE_CURRENT_LIST_DIR}/../../" CACHE STRING "The location of alpakaConfig.cmake") - list(APPEND CMAKE_MODULE_PATH "${ALPAKA_ROOT}") - find_package(alpaka REQUIRED) + option(USE_ALPAKA_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF) + + if(USE_ALPAKA_SOURCE_TREE) + # Don't build the examples recursively + set(alpaka_BUILD_EXAMPLES OFF) + add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka") + else() + find_package(alpaka REQUIRED) + endif() endif() #------------------------------------------------------------------------------- diff --git a/thirdParty/cupla/alpaka/example/bufferCopy/src/bufferCopy.cpp b/thirdParty/cupla/alpaka/example/bufferCopy/src/bufferCopy.cpp index 794a18d46e..8f65b9e294 100644 --- a/thirdParty/cupla/alpaka/example/bufferCopy/src/bufferCopy.cpp +++ b/thirdParty/cupla/alpaka/example/bufferCopy/src/bufferCopy.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Alexander Matthes, Benjamin Worpitz, Erik Zenker, Matthias Werner * - * This file exemplifies usage of Alpaka. + * This file exemplifies usage of alpaka. 
* * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -16,6 +16,7 @@ */ #include +#include #include #include @@ -143,6 +144,7 @@ auto main() // It is possible to choose from a set of accelerators // that are defined in the alpaka::acc namespace e.g.: // - AccGpuCudaRt + // - AccGpuHipRt // - AccCpuThreads // - AccCpuFibers // - AccCpuOmp2Threads @@ -150,14 +152,14 @@ auto main() // - AccCpuOmp4 // - AccCpuTbbBlocks // - AccCpuSerial - using Acc = alpaka::acc::AccCpuSerial; + // using Acc = alpaka::acc::AccCpuSerial; + using Acc = alpaka::example::ExampleDefaultAcc; + std::cout << "Using alpaka accelerator: " << alpaka::acc::getAccName() << std::endl; // Defines the synchronization behavior of a queue // // choose between Blocking and NonBlocking using AccQueueProperty = alpaka::queue::Blocking; using DevQueue = alpaka::queue::Queue; - using DevAcc = alpaka::dev::Dev; - using PltfAcc = alpaka::pltf::Pltf; // Define the device accelerator // @@ -175,33 +177,32 @@ auto main() // choose between Blocking and NonBlocking using HostQueueProperty = alpaka::queue::Blocking; using HostQueue = alpaka::queue::Queue; - using DevHost = alpaka::dev::Dev; - using PltfHost = alpaka::pltf::Pltf; // Select devices - DevAcc const devAcc(alpaka::pltf::getDevByIdx(0u)); - DevHost const devHost(alpaka::pltf::getDevByIdx(0u)); + auto const devAcc = alpaka::pltf::getDevByIdx(0u); + auto const devHost = alpaka::pltf::getDevByIdx(0u); // Create queues DevQueue devQueue(devAcc); HostQueue hostQueue(devHost); - // Define the work division + // Define the work division for kernels to be run on devAcc and devHost using Vec = alpaka::vec::Vec; Vec const elementsPerThread(Vec::all(static_cast(1))); - Vec const threadsPerBlock(Vec::all(static_cast(1))); - - Vec const blocksPerGrid( - static_cast(4), - static_cast(8), - static_cast(16)); - + Vec const threadsPerGrid(Vec::all(static_cast(10))); using 
WorkDiv = alpaka::workdiv::WorkDivMembers; - WorkDiv const workdiv( - blocksPerGrid, - threadsPerBlock, - elementsPerThread); - + WorkDiv const devWorkDiv = alpaka::workdiv::getValidWorkDiv( + devAcc, + threadsPerGrid, + elementsPerThread, + false, + alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted); + WorkDiv const hostWorkDiv = alpaka::workdiv::getValidWorkDiv( + devHost, + threadsPerGrid, + elementsPerThread, + false, + alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted); // Create host and device buffers // @@ -220,19 +221,19 @@ auto main() // // The `alloc` method returns a reference counted buffer handle. // When the last such handle is destroyed, the memory is freed automatically. - using BufHost = alpaka::mem::buf::Buf; + using BufHost = alpaka::mem::buf::Buf; BufHost hostBuffer(alpaka::mem::buf::alloc(devHost, extents)); // You can also use already allocated memory and wrap it within a view (irrespective of the device type). // The view does not own the underlying memory. So you have to make sure that // the view does not outlive its underlying memory. std::array plainBuffer; - using ViewHost = alpaka::mem::view::ViewPlainPtr; + using ViewHost = alpaka::mem::view::ViewPlainPtr; ViewHost hostViewPlainPtr(plainBuffer.data(), devHost, extents); // Allocate accelerator memory buffers // // The interface to allocate a buffer is the same on the host and on the device. 
- using BufAcc = alpaka::mem::buf::Buf; + using BufAcc = alpaka::mem::buf::Buf; BufAcc deviceBuffer1(alpaka::mem::buf::alloc(devAcc, extents)); BufAcc deviceBuffer2(alpaka::mem::buf::alloc(devAcc, extents)); @@ -263,7 +264,7 @@ auto main() alpaka::kernel::exec( hostQueue, - workdiv, + hostWorkDiv, fillBufferKernel, pHostViewPlainPtr, // 1st kernel argument extents); // 2nd kernel argument @@ -303,7 +304,7 @@ auto main() TestBufferKernel testBufferKernel; alpaka::kernel::exec( devQueue, - workdiv, + devWorkDiv, testBufferKernel, pDeviceBuffer1, // 1st kernel argument extents, // 2nd kernel argument @@ -311,7 +312,7 @@ auto main() alpaka::kernel::exec( devQueue, - workdiv, + devWorkDiv, testBufferKernel, pDeviceBuffer2, // 1st kernel argument extents, // 2nd kernel argument @@ -331,7 +332,7 @@ auto main() PrintBufferKernel printBufferKernel; alpaka::kernel::exec( devQueue, - workdiv, + devWorkDiv, printBufferKernel, pDeviceBuffer1, // 1st kernel argument extents, // 2nd kernel argument @@ -341,7 +342,7 @@ auto main() alpaka::kernel::exec( devQueue, - workdiv, + devWorkDiv, printBufferKernel, pDeviceBuffer2, // 1st kernel argument extents, // 2nd kernel argument @@ -351,7 +352,7 @@ auto main() alpaka::kernel::exec( hostQueue, - workdiv, + hostWorkDiv, printBufferKernel, pHostBuffer, // 1st kernel argument extents, // 2nd kernel argument @@ -361,7 +362,7 @@ auto main() alpaka::kernel::exec( hostQueue, - workdiv, + hostWorkDiv, printBufferKernel, pHostViewPlainPtr, // 1st kernel argument extents, // 2nd kernel argument diff --git a/thirdParty/cupla/alpaka/example/heatEquation/CMakeLists.txt b/thirdParty/cupla/alpaka/example/heatEquation/CMakeLists.txt new file mode 100644 index 0000000000..525bb3810a --- /dev/null +++ b/thirdParty/cupla/alpaka/example/heatEquation/CMakeLists.txt @@ -0,0 +1,61 @@ +# +# Copyright 2014-2020 Benjamin Worpitz, Jan Stephan +# +# This file exemplifies usage of alpaka. 
+# +# Permission to use, copy, modify, and/or distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY +# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR +# IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# + +################################################################################ +# Required CMake version. + +cmake_minimum_required(VERSION 3.15) + +set_property(GLOBAL PROPERTY USE_FOLDERS ON) + +################################################################################ +# Project. + +set(_TARGET_NAME heatEquation) + +project(${_TARGET_NAME}) + + +#------------------------------------------------------------------------------- +# Find alpaka. + +if(NOT TARGET alpaka::alpaka) + option(USE_ALPAKA_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF) + + if(USE_ALPAKA_SOURCE_TREE) + # Don't build the examples recursively + set(alpaka_BUILD_EXAMPLES OFF) + add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka") + else() + find_package(alpaka REQUIRED) + endif() +endif() + +#------------------------------------------------------------------------------- +# Add executable. 
+ +alpaka_add_executable( + ${_TARGET_NAME} + src/heatEquation.cpp) +target_link_libraries( + ${_TARGET_NAME} + PUBLIC alpaka::alpaka) + +set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example) + +add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME}) diff --git a/thirdParty/cupla/alpaka/example/heatEquation/src/heatEquation.cpp b/thirdParty/cupla/alpaka/example/heatEquation/src/heatEquation.cpp new file mode 100644 index 0000000000..2550cd267f --- /dev/null +++ b/thirdParty/cupla/alpaka/example/heatEquation/src/heatEquation.cpp @@ -0,0 +1,303 @@ +/* Copyright 2020 Benjamin Worpitz, Matthias Werner, Jakob Krude, + * Sergei Bastrakov + * + * This file exemplifies usage of alpaka. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR + * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include +#include +#include +#include + + +//############################################################################# +//! alpaka version of explicit finite-difference 1d heat equation solver +//! +//! Solving equation u_t(x, t) = u_xx(x, t) using a simple explicit scheme with +//! forward difference in t and second-order central difference in x +//! +//! \param uCurrBuf grid values of u for each x and the current value of t: +//! u(x, t) | t = t_current +//! 
\param uNextBuf resulting grid values of u for each x and the next value of t: +//! u(x, t) | t = t_current + dt +//! \param extent number of grid nodes in x (eq. to numNodesX) +//! \param dx step in x +//! \param dt step in t + +struct HeatEquationKernel +{ + template< + typename TAcc> + ALPAKA_FN_ACC auto operator()( + TAcc const & acc, + double const * const uCurrBuf, + double * const uNextBuf, + uint32_t const extent, + double const dx, + double const dt) const + -> void + { + // Each kernel executes one element + double const r = dt / ( dx * dx ); + int idx = + alpaka::idx::getIdx< + alpaka::Grid, + alpaka::Threads + >( acc )[0]; + if( idx > 0 && idx < extent - 1u ) + { + uNextBuf[idx] = + uCurrBuf[idx] * ( 1.0 - 2.0 * r ) + + uCurrBuf[idx - 1] * r + + uCurrBuf[idx + 1] * r; + } + } +}; + + +//! Exact solution to the test problem +//! u_t(x, t) = u_xx(x, t), x in [0, 1], t in [0, T] +//! u(0, t) = u(1, t) = 0 +//! u(x, 0) = sin(pi * x) +//! +//! \param x value of x +//! \param t value of t +double exactSolution( + double const x, + double const t +) +{ + constexpr double pi = 3.14159265358979323846; + return std::exp( -pi * pi * t ) * std::sin( pi * x ); +} + + +//! Each kernel computes the next step for one point. +//! Therefore the number of threads should be equal to numNodesX. +//! Every time step the kernel will be executed numNodesX times. +//! After every step the curr-buffer will be set to the calculated values +//! from the next-buffer. 
+auto main( ) -> int +{ +#if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) + return EXIT_SUCCESS; +#else + // Parameters (a user is supposed to change numNodesX, numTimeSteps) + uint32_t const numNodesX = 1000; + uint32_t const numTimeSteps = 10000; + double const tMax = 0.001; + // x in [0, 1], t in [0, tMax] + double const dx = 1.0 / static_cast< double >( numNodesX - 1 ); + double const dt = tMax / static_cast< double >( numTimeSteps - 1 ); + + // Check the stability condition + double const r = dt / ( dx * dx ); + if( r > 0.5 ) + { + std::cerr << "Stability condition check failed: dt/dx^2 = " << r + << ", it is required to be <= 0.5\n"; + return EXIT_FAILURE; + } + + // Set Dim and Idx type + using Dim = alpaka::dim::DimInt< 1u >; + using Idx = uint32_t; + + // Select accelerator-types for host and device + // using Acc = alpaka::acc::AccCpuSerial; + using Acc = alpaka::example::ExampleDefaultAcc; + std::cout << "Using alpaka accelerator: " << alpaka::acc::getAccName() << std::endl; + + using DevHost = alpaka::dev::DevCpu; + + // Select specific devices + auto const devAcc = alpaka::pltf::getDevByIdx< Acc >( 0u ); + auto const devHost = alpaka::pltf::getDevByIdx< DevHost >( 0u ); + + // Get valid workdiv for the given problem + uint32_t elemPerThread = 1; + alpaka::vec::Vec< + Dim, + Idx + > const extent { numNodesX }; + using WorkDiv = alpaka::workdiv::WorkDivMembers< + Dim, + Idx + >; + auto workdiv = WorkDiv{ + alpaka::workdiv::getValidWorkDiv< Acc >( + devAcc, + extent, + elemPerThread, + false, + alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted + ) + }; + + // Select queue + using QueueProperty = alpaka::queue::Blocking; + using QueueAcc = alpaka::queue::Queue< + Acc, + QueueProperty + >; + QueueAcc queue { devAcc }; + + // Initialize host-buffer + using BufHost = alpaka::mem::buf::Buf< + DevHost, + double, + Dim, + Idx + >; + // This buffer holds the calculated values + auto uNextBufHost = BufHost{ + 
alpaka::mem::buf::alloc< + double, + Idx + >( + devHost, + extent + ) + }; + // This buffer will hold the current values (used for the next step) + auto uCurrBufHost = BufHost{ + alpaka::mem::buf::alloc< + double, + Idx + >( + devHost, + extent + ) + }; + + double * const pCurrHost = alpaka::mem::view::getPtrNative( uCurrBufHost ); + double * const pNextHost = alpaka::mem::view::getPtrNative( uNextBufHost ); + + // Accelerator buffer + using BufAcc = alpaka::mem::buf::Buf< + Acc, + double, + Dim, + Idx + >; + auto uNextBufAcc = BufAcc{ + alpaka::mem::buf::alloc< + double, + Idx + >( + devAcc, + extent + ) + }; + auto uCurrBufAcc = BufAcc{ + alpaka::mem::buf::alloc< + double, + Idx + >( + devAcc, + extent + ) + }; + + double * pCurrAcc = alpaka::mem::view::getPtrNative( uCurrBufAcc ); + double * pNextAcc = alpaka::mem::view::getPtrNative( uNextBufAcc ); + + // Apply initial conditions for the test problem + for( uint32_t i = 0; i < numNodesX; i++ ) + { + pCurrHost[i] = exactSolution( + i * dx, + 0.0 + ); + } + + HeatEquationKernel kernel; + + // Copy host -> device + alpaka::mem::view::copy( + queue, + uCurrBufAcc, + uCurrBufHost, + extent + ); + // Copy to the buffer for next as well to have boundary values set + alpaka::mem::view::copy( + queue, + uNextBufAcc, + uCurrBufAcc, + extent + ); + alpaka::wait::wait( queue ); + + for( uint32_t step = 0; step < numTimeSteps; step++ ) + { + // Compute next values + alpaka::kernel::exec< Acc >( + queue, + workdiv, + kernel, + pCurrAcc, + pNextAcc, + numNodesX, + dx, + dt + ); + + // We assume the boundary conditions are constant and so these values + // do not need to be updated. 
+ // So we just swap next to curr (shallow copy) + std::swap( + pCurrAcc, + pNextAcc + ); + } + + // Copy device -> host + alpaka::mem::view::copy( + queue, + uNextBufHost, + uNextBufAcc, + extent + ); + alpaka::wait::wait( queue ); + + // Calculate error + double maxError = 0.0; + for( uint32_t i = 0; i < numNodesX; i++ ) + { + auto const error = std::abs( + pNextHost[i] - exactSolution( i * dx, tMax ) + ); + maxError = std::max( maxError, error ); + } + + double const errorThreshold = 1e-5; + bool resultCorrect = ( maxError < errorThreshold ); + if( resultCorrect ) + { + std::cout << "Execution results correct!" << std::endl; + return EXIT_SUCCESS; + } + else + { + std::cout << "Execution results incorrect: error = " << maxError + << " (the grid resolution may be too low)" << std::endl; + return EXIT_FAILURE; + } +#endif +} diff --git a/thirdParty/cupla/alpaka/example/helloWorld/CMakeLists.txt b/thirdParty/cupla/alpaka/example/helloWorld/CMakeLists.txt index 7ff6067cd5..1456701fce 100644 --- a/thirdParty/cupla/alpaka/example/helloWorld/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/example/helloWorld/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Erik Zenker, Benjamin Worpitz, Jan Stephan # -# This file exemplifies usage of Alpaka. +# This file exemplifies usage of alpaka. # # Permission to use, copy, modify, and/or distribute this software for any # purpose with or without fee is hereby granted, provided that the above @@ -34,9 +34,15 @@ project(${_TARGET_NAME}) # Find alpaka. if(NOT TARGET alpaka::alpaka) - set(ALPAKA_ROOT "${CMAKE_CURRENT_LIST_DIR}/../../" CACHE STRING "The location of alpakaConfig.cmake") - list(APPEND CMAKE_MODULE_PATH "${ALPAKA_ROOT}") - find_package(alpaka REQUIRED) + option(USE_ALPAKA_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF) + + if(USE_ALPAKA_SOURCE_TREE) + # Don't build the examples recursively + set(alpaka_BUILD_EXAMPLES OFF) + add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." 
"${CMAKE_BINARY_DIR}/alpaka") + else() + find_package(alpaka REQUIRED) + endif() endif() #------------------------------------------------------------------------------- diff --git a/thirdParty/cupla/alpaka/example/helloWorld/src/helloWorld.cpp b/thirdParty/cupla/alpaka/example/helloWorld/src/helloWorld.cpp index 34e4372a86..891780f0e1 100644 --- a/thirdParty/cupla/alpaka/example/helloWorld/src/helloWorld.cpp +++ b/thirdParty/cupla/alpaka/example/helloWorld/src/helloWorld.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Erik Zenker * - * This file exemplifies usage of Alpaka. + * This file exemplifies usage of alpaka. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -16,6 +16,7 @@ */ #include +#include #include @@ -87,6 +88,7 @@ auto main() // It is possible to choose from a set of accelerators // that are defined in the alpaka::acc namespace e.g.: // - AccGpuCudaRt + // - AccGpuHipRt // - AccCpuThreads // - AccCpuFibers // - AccCpuOmp2Threads @@ -102,16 +104,15 @@ auto main() // automatically. // By exchanging the Acc and Queue types you can select where to execute the kernel. - using Acc = alpaka::acc::AccCpuSerial; + // using Acc = alpaka::acc::AccCpuSerial; + using Acc = alpaka::example::ExampleDefaultAcc; + std::cout << "Using alpaka accelerator: " << alpaka::acc::getAccName() << std::endl; // Defines the synchronization behavior of a queue // // choose between Blocking and NonBlocking using QueueProperty = alpaka::queue::Blocking; using Queue = alpaka::queue::Queue; - using Dev = alpaka::dev::Dev; - using Pltf = alpaka::pltf::Pltf; - // Select a device // @@ -121,7 +122,7 @@ auto main() // by id (0 to the number of devices minus 1) or you // can also retrieve all devices in a vector (getDevs()). // In this example the first devices is choosen. 
- Dev const devAcc(alpaka::pltf::getDevByIdx(0u)); + auto const devAcc = alpaka::pltf::getDevByIdx(0u); // Create a queue on the device // @@ -165,18 +166,14 @@ auto main() // vector processing unit. using Vec = alpaka::vec::Vec; Vec const elementsPerThread(Vec::all(static_cast(1))); - Vec const threadsPerBlock(Vec::all(static_cast(1))); - Vec const blocksPerGrid( - static_cast(4), - static_cast(8), - static_cast(16)); - + Vec const threadsPerGrid(Vec::all(static_cast(8))); using WorkDiv = alpaka::workdiv::WorkDivMembers; - WorkDiv const workDiv( - blocksPerGrid, - threadsPerBlock, - elementsPerThread); - + WorkDiv const workDiv = alpaka::workdiv::getValidWorkDiv( + devAcc, + threadsPerGrid, + elementsPerThread, + false, + alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted); // Instantiate the kernel function object // diff --git a/thirdParty/cupla/alpaka/example/helloWorldLambda/CMakeLists.txt b/thirdParty/cupla/alpaka/example/helloWorldLambda/CMakeLists.txt index 11ec8457ad..e262e792e3 100644 --- a/thirdParty/cupla/alpaka/example/helloWorldLambda/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/example/helloWorldLambda/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Erik Zenker, Benjamin Worpitz, Jan Stephan # -# This file exemplifies usage of Alpaka. +# This file exemplifies usage of alpaka. # # Permission to use, copy, modify, and/or distribute this software for any # purpose with or without fee is hereby granted, provided that the above @@ -34,9 +34,15 @@ project(${_TARGET_NAME}) # Find alpaka. 
if(NOT TARGET alpaka::alpaka) - set(ALPAKA_ROOT "${CMAKE_CURRENT_LIST_DIR}/../../" CACHE STRING "The location of alpakaConfig.cmake") - list(APPEND CMAKE_MODULE_PATH "${ALPAKA_ROOT}") - find_package(alpaka REQUIRED) + option(USE_ALPAKA_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF) + + if(USE_ALPAKA_SOURCE_TREE) + # Don't build the examples recursively + set(alpaka_BUILD_EXAMPLES OFF) + add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka") + else() + find_package(alpaka REQUIRED) + endif() endif() #------------------------------------------------------------------------------- diff --git a/thirdParty/cupla/alpaka/example/helloWorldLambda/src/helloWorldLambda.cpp b/thirdParty/cupla/alpaka/example/helloWorldLambda/src/helloWorldLambda.cpp index af8f4a8ce9..ec692d5dac 100644 --- a/thirdParty/cupla/alpaka/example/helloWorldLambda/src/helloWorldLambda.cpp +++ b/thirdParty/cupla/alpaka/example/helloWorldLambda/src/helloWorldLambda.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Erik Zenker * - * This file exemplifies usage of Alpaka. + * This file exemplifies usage of alpaka. 
* * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -16,6 +16,7 @@ */ #include +#include #include @@ -73,6 +74,7 @@ auto main() // It is possible to choose from a set of accelerators // that are defined in the alpaka::acc namespace e.g.: // - AccGpuCudaRt + // - AccGpuHipRt // - AccCpuThreads // - AccCpuFibers // - AccCpuOmp2Threads @@ -80,18 +82,18 @@ auto main() // - AccCpuOmp4 // - AccCpuTbbBlocks // - AccCpuSerial - using Acc = alpaka::acc::AccCpuSerial; + // using Acc = alpaka::acc::AccCpuSerial; + using Acc = alpaka::example::ExampleDefaultAcc; + std::cout << "Using alpaka accelerator: " << alpaka::acc::getAccName() << std::endl; // Defines the synchronization behavior of a queue // // choose between Blocking and NonBlocking using QueueProperty = alpaka::queue::Blocking; using Queue = alpaka::queue::Queue; - using Dev = alpaka::dev::Dev; - using Pltf = alpaka::pltf::Pltf; // Select a device - Dev const devAcc(alpaka::pltf::getDevByIdx(0u)); + auto const devAcc = alpaka::pltf::getDevByIdx(0u); // Create a queue on the device Queue queue(devAcc); @@ -99,24 +101,21 @@ auto main() // Define the work division using Vec = alpaka::vec::Vec; Vec const elementsPerThread(Vec::all(static_cast(1))); - Vec const threadsPerBlock(Vec::all(static_cast(1))); - Vec const blocksPerGrid( - static_cast(1), - static_cast(2), - static_cast(4)); - + Vec const threadsPerGrid(Vec::all(static_cast(8))); using WorkDiv = alpaka::workdiv::WorkDivMembers; - WorkDiv const workDiv( - blocksPerGrid, - threadsPerBlock, - elementsPerThread); + WorkDiv const workDiv = alpaka::workdiv::getValidWorkDiv( + devAcc, + threadsPerGrid, + elementsPerThread, + false, + alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted); const size_t nExclamationMarks = 10; // Run "Hello World" kernel with a lambda function // - // Alpaka is able to execute lambda functions (anonymous functions). 
- // Alpaka forces the lambda function to accept + // alpaka is able to execute lambda functions (anonymous functions). + // alpaka forces the lambda function to accept // the utilized accelerator as first argument. // All following arguments can be provided after // the lambda function declaration or be captured. diff --git a/thirdParty/cupla/alpaka/example/reduce/CMakeLists.txt b/thirdParty/cupla/alpaka/example/reduce/CMakeLists.txt index cc5799c830..9e7e3f1f7d 100644 --- a/thirdParty/cupla/alpaka/example/reduce/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/example/reduce/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Erik Zenker, Benjamin Worpitz, Jan Stephan # -# This file exemplifies usage of Alpaka. +# This file exemplifies usage of alpaka. # # Permission to use, copy, modify, and/or distribute this software for any # purpose with or without fee is hereby granted, provided that the above @@ -34,9 +34,15 @@ project(${_TARGET_NAME}) # Find alpaka. if(NOT TARGET alpaka::alpaka) - set(ALPAKA_ROOT "${CMAKE_CURRENT_LIST_DIR}/../../" CACHE STRING "The location of alpakaConfig.cmake") - list(APPEND CMAKE_MODULE_PATH "${ALPAKA_ROOT}") - find_package(alpaka REQUIRED) + option(USE_ALPAKA_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF) + + if(USE_ALPAKA_SOURCE_TREE) + # Don't build the examples recursively + set(alpaka_BUILD_EXAMPLES OFF) + add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." 
"${CMAKE_BINARY_DIR}/alpaka") + else() + find_package(alpaka REQUIRED) + endif() endif() #------------------------------------------------------------------------------- diff --git a/thirdParty/cupla/alpaka/example/reduce/src/alpakaConfig.hpp b/thirdParty/cupla/alpaka/example/reduce/src/alpakaConfig.hpp index b6cfe73024..a20de0b477 100644 --- a/thirdParty/cupla/alpaka/example/reduce/src/alpakaConfig.hpp +++ b/thirdParty/cupla/alpaka/example/reduce/src/alpakaConfig.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Jonas Schenke * - * This file exemplifies usage of Alpaka. + * This file exemplifies usage of alpaka. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -64,12 +64,6 @@ struct CpuOmp2Blocks { using Host = alpaka::acc::AccCpuOmp2Blocks; using Acc = alpaka::acc::AccCpuOmp2Blocks; - using DevHost = alpaka::dev::Dev; - using DevAcc = alpaka::dev::Dev; - using PltfHost = alpaka::pltf::Pltf; - using PltfAcc = alpaka::pltf::Pltf; - using Stream = alpaka::queue::QueueCpuBlocking; - using Event = alpaka::event::Event; using SmCount = alpaka::dim::DimInt<1u>; using MaxBlockSize = alpaka::dim::DimInt<1u>; }; @@ -92,12 +86,6 @@ struct CpuOmp4 { using Host = alpaka::acc::AccCpuSerial; using Acc = alpaka::acc::AccCpuOmp4; - using DevHost = alpaka::dev::Dev; - using DevAcc = alpaka::dev::Dev; - using PltfHost = alpaka::pltf::Pltf; - using PltfAcc = alpaka::pltf::Pltf; - using Stream = alpaka::queue::QueueCpuBlocking; - using Event = alpaka::event::Event; using MaxBlockSize = alpaka::dim::DimInt<1u>; }; @@ -118,12 +106,6 @@ struct CpuSerial { using Host = alpaka::acc::AccCpuSerial; using Acc = alpaka::acc::AccCpuSerial; - using DevHost = alpaka::dev::Dev; - using DevAcc = alpaka::dev::Dev; - using PltfHost = alpaka::pltf::Pltf; - using PltfAcc = alpaka::pltf::Pltf; - using Stream = alpaka::queue::QueueCpuBlocking; - using Event = alpaka::event::Event; using MaxBlockSize = 
alpaka::dim::DimInt<1u>; }; @@ -143,12 +125,6 @@ struct CpuThreads { using Host = alpaka::acc::AccCpuThreads; using Acc = alpaka::acc::AccCpuThreads; - using DevHost = alpaka::dev::Dev; - using DevAcc = alpaka::dev::Dev; - using PltfHost = alpaka::pltf::Pltf; - using PltfAcc = alpaka::pltf::Pltf; - using Stream = alpaka::queue::QueueCpuBlocking; - using Event = alpaka::event::Event; using MaxBlockSize = alpaka::dim::DimInt<1u>; }; @@ -165,16 +141,10 @@ struct GetIterator> //! CUDA defines //! //! Defines Host, Device, etc. for the CUDA/HIP accelerator. -struct GpuUniformCudaHipRt +struct GpuCudaRt { using Host = alpaka::acc::AccCpuSerial; - using Acc = alpaka::acc::AccGpuUniformCudaHipRt; - using DevHost = alpaka::dev::Dev; - using DevAcc = alpaka::dev::Dev; - using PltfHost = alpaka::pltf::Pltf; - using PltfAcc = alpaka::pltf::Pltf; - using Stream = alpaka::queue::QueueUniformCudaHipRtNonBlocking; - using Event = alpaka::event::Event; + using Acc = alpaka::acc::AccGpuCudaRt; using MaxBlockSize = alpaka::dim::DimInt<1024u>; }; diff --git a/thirdParty/cupla/alpaka/example/reduce/src/iterator.hpp b/thirdParty/cupla/alpaka/example/reduce/src/iterator.hpp index 15c3da3097..a4303f6a7b 100644 --- a/thirdParty/cupla/alpaka/example/reduce/src/iterator.hpp +++ b/thirdParty/cupla/alpaka/example/reduce/src/iterator.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Jonas Schenke * - * This file exemplifies usage of Alpaka. + * This file exemplifies usage of alpaka. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above diff --git a/thirdParty/cupla/alpaka/example/reduce/src/kernel.hpp b/thirdParty/cupla/alpaka/example/reduce/src/kernel.hpp index 63193a9976..e52b40a289 100644 --- a/thirdParty/cupla/alpaka/example/reduce/src/kernel.hpp +++ b/thirdParty/cupla/alpaka/example/reduce/src/kernel.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Jonas Schenke * - * This file exemplifies usage of Alpaka. 
+ * This file exemplifies usage of alpaka. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above diff --git a/thirdParty/cupla/alpaka/example/reduce/src/reduce.cpp b/thirdParty/cupla/alpaka/example/reduce/src/reduce.cpp index 724979790a..15b43403d4 100644 --- a/thirdParty/cupla/alpaka/example/reduce/src/reduce.cpp +++ b/thirdParty/cupla/alpaka/example/reduce/src/reduce.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Jonas Schenke, Matthias Werner * - * This file exemplifies usage of Alpaka. + * This file exemplifies usage of alpaka. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -36,12 +36,10 @@ // using Accelerator = CpuSerial; -using DevAcc = Accelerator::DevAcc; -using DevHost = Accelerator::DevHost; -using QueueAcc = Accelerator::Stream; using Acc = Accelerator::Acc; -using PltfAcc = Accelerator::PltfAcc; -using PltfHost = Accelerator::PltfHost; +using Host = Accelerator::Host; +using QueueProperty = alpaka::queue::Blocking; +using QueueAcc = alpaka::queue::Queue; using MaxBlockSize = Accelerator::MaxBlockSize; //----------------------------------------------------------------------------- @@ -58,7 +56,7 @@ using MaxBlockSize = Accelerator::MaxBlockSize; //! \param func The reduction function. //! //! Returns true if the reduction was correct and false otherwise. 
-template +template T reduce(DevHost devHost, DevAcc devAcc, QueueAcc queue, uint64_t n, alpaka::mem::buf::Buf hostMemory, TFunc func) { static constexpr uint64_t blockSize = getMaxBlockSize(); @@ -66,7 +64,7 @@ T reduce(DevHost devHost, DevAcc devAcc, QueueAcc queue, uint64_t n, alpaka::mem // calculate optimal block size (8 times the MP count proved to be // relatively near to peak performance in benchmarks) uint32_t blockCount = static_cast( - alpaka::acc::getAccDevProps(devAcc).m_multiProcessorCount * + alpaka::acc::getAccDevProps(devAcc).m_multiProcessorCount * 8); uint32_t maxBlockCount = static_cast( (((n + 1) / 2) - 1) / blockSize + 1); // ceil(ceil(n/2.0)/blockSize) @@ -135,14 +133,14 @@ int main() using T = uint32_t; static constexpr uint64_t blockSize = getMaxBlockSize(); - DevAcc devAcc(alpaka::pltf::getDevByIdx(dev)); - DevHost devHost(alpaka::pltf::getDevByIdx(0u)); + auto devAcc = alpaka::pltf::getDevByIdx(dev); + auto devHost = alpaka::pltf::getDevByIdx(0u); QueueAcc queue(devAcc); // calculate optimal block size (8 times the MP count proved to be // relatively near to peak performance in benchmarks) uint32_t blockCount = static_cast( - alpaka::acc::getAccDevProps(devAcc).m_multiProcessorCount * + alpaka::acc::getAccDevProps(devAcc).m_multiProcessorCount * 8); uint32_t maxBlockCount = static_cast( (((n + 1) / 2) - 1) / blockSize + 1); // ceil(ceil(n/2.0)/blockSize) diff --git a/thirdParty/cupla/alpaka/example/vectorAdd/CMakeLists.txt b/thirdParty/cupla/alpaka/example/vectorAdd/CMakeLists.txt index 636b3935f2..11e45e289e 100644 --- a/thirdParty/cupla/alpaka/example/vectorAdd/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/example/vectorAdd/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Benjamin Worpitz, Jan Stephan # -# This file exemplifies usage of Alpaka. +# This file exemplifies usage of alpaka. 
# # Permission to use, copy, modify, and/or distribute this software for any # purpose with or without fee is hereby granted, provided that the above @@ -35,9 +35,15 @@ project(${_TARGET_NAME}) # Find alpaka. if(NOT TARGET alpaka::alpaka) - set(ALPAKA_ROOT "${CMAKE_CURRENT_LIST_DIR}/../../" CACHE STRING "The location of the alpaka library") - list(APPEND CMAKE_MODULE_PATH "${ALPAKA_ROOT}") - find_package(alpaka REQUIRED) + option(USE_ALPAKA_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF) + + if(USE_ALPAKA_SOURCE_TREE) + # Don't build the examples recursively + set(alpaka_BUILD_EXAMPLES OFF) + add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka") + else() + find_package(alpaka REQUIRED) + endif() endif() #------------------------------------------------------------------------------- diff --git a/thirdParty/cupla/alpaka/example/vectorAdd/src/vectorAdd.cpp b/thirdParty/cupla/alpaka/example/vectorAdd/src/vectorAdd.cpp index b14f0198ba..f7721a0531 100644 --- a/thirdParty/cupla/alpaka/example/vectorAdd/src/vectorAdd.cpp +++ b/thirdParty/cupla/alpaka/example/vectorAdd/src/vectorAdd.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Matthias Werner * - * This file exemplifies usage of Alpaka. + * This file exemplifies usage of alpaka. 
* * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -16,6 +16,7 @@ */ #include +#include #include #include @@ -79,6 +80,7 @@ auto main() #if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) return EXIT_SUCCESS; #else + // Define the index domain using Dim = alpaka::dim::DimInt<1u>; using Idx = std::size_t; @@ -88,6 +90,7 @@ auto main() // It is possible to choose from a set of accelerators // that are defined in the alpaka::acc namespace e.g.: // - AccGpuCudaRt + // - AccGpuHipRt // - AccCpuThreads // - AccCpuFibers // - AccCpuOmp2Threads @@ -95,9 +98,9 @@ auto main() // - AccCpuOmp4 // - AccCpuTbbBlocks // - AccCpuSerial - using Acc = alpaka::acc::AccCpuSerial; - using DevAcc = alpaka::dev::Dev; - using PltfAcc = alpaka::pltf::Pltf; + // using Acc = alpaka::acc::AccCpuSerial; + using Acc = alpaka::example::ExampleDefaultAcc; + std::cout << "Using alpaka accelerator: " << alpaka::acc::getAccName() << std::endl; // Defines the synchronization behavior of a queue // @@ -106,7 +109,7 @@ auto main() using QueueAcc = alpaka::queue::Queue; // Select a device - DevAcc const devAcc(alpaka::pltf::getDevByIdx(0u)); + auto const devAcc = alpaka::pltf::getDevByIdx(0u); // Create a queue on the device QueueAcc queue(devAcc); @@ -130,8 +133,7 @@ auto main() // Get the host device for allocating memory on the host. 
using DevHost = alpaka::dev::DevCpu; - using PltfHost = alpaka::pltf::Pltf; - DevHost const devHost(alpaka::pltf::getDevByIdx(0u)); + auto const devHost = alpaka::pltf::getDevByIdx(0u); // Allocate 3 host memory buffers using BufHost = alpaka::mem::buf::Buf; @@ -144,7 +146,7 @@ auto main() Data * const pBufHostB(alpaka::mem::view::getPtrNative(bufHostB)); Data * const pBufHostC(alpaka::mem::view::getPtrNative(bufHostC)); - // C++11 random generator for uniformly distributed numbers in {1,..,42} + // C++14 random generator for uniformly distributed numbers in {1,..,42} std::random_device rd{}; std::default_random_engine eng{ rd() }; std::uniform_int_distribution dist(1, 42); @@ -157,7 +159,7 @@ auto main() } // Allocate 3 buffers on the accelerator - using BufAcc = alpaka::mem::buf::Buf; + using BufAcc = alpaka::mem::buf::Buf; BufAcc bufAccA(alpaka::mem::buf::alloc(devAcc, extent)); BufAcc bufAccB(alpaka::mem::buf::alloc(devAcc, extent)); BufAcc bufAccC(alpaka::mem::buf::alloc(devAcc, extent)); diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuFibers.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuFibers.hpp index 98fde79cbb..9d7fd259e9 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuFibers.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuFibers.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -19,11 +19,13 @@ #include #include #include -#include +#include #include #include +#include #include #include +#include // Specialized traits. 
#include @@ -76,11 +78,13 @@ namespace alpaka atomic::AtomicNoOp // thread atomics >, public math::MathStdLib, - public block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc, + public block::shared::dyn::BlockSharedMemDynAlignedAlloc, public block::shared::st::BlockSharedMemStMasterSync, public block::sync::BlockSyncBarrierFiber, + public intrinsic::IntrinsicCpu, public rand::RandStdLib, public time::TimeStdLib, + public warp::WarpSingleThread, public concepts::Implements> { public: @@ -108,7 +112,7 @@ namespace alpaka atomic::AtomicNoOp // atomics between threads >(), math::MathStdLib(), - block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc(static_cast(blockSharedMemDynSizeBytes)), + block::shared::dyn::BlockSharedMemDynAlignedAlloc(static_cast(blockSharedMemDynSizeBytes)), block::shared::st::BlockSharedMemStMasterSync( [this](){block::sync::syncBlockThreads(*this);}, [this](){return (m_masterFiberId == boost::this_fiber::get_id());}), @@ -168,8 +172,6 @@ namespace alpaka dev::DevCpu const & dev) -> alpaka::acc::AccDevProps { - alpaka::ignore_unused(dev); - #ifdef ALPAKA_CI auto const blockThreadCountMax(static_cast(3)); #else @@ -189,7 +191,9 @@ namespace alpaka // m_threadElemExtentMax vec::Vec::all(std::numeric_limits::max()), // m_threadElemCountMax - std::numeric_limits::max()}; + std::numeric_limits::max(), + // m_sharedMemSizeBytes + dev::getMemBytes( dev )}; } }; //############################################################################# diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp2Blocks.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp2Blocks.hpp index e9ac16caa5..3bab307f16 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp2Blocks.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp2Blocks.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. 
* * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -24,11 +24,13 @@ #include #include #include -#include -#include +#include +#include #include +#include #include #include +#include // Specialized traits. #include @@ -77,11 +79,13 @@ namespace alpaka atomic::AtomicNoOp // thread atomics >, public math::MathStdLib, - public block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc, - public block::shared::st::BlockSharedMemStNoSync, + public block::shared::dyn::BlockSharedMemDynMember<>, + public block::shared::st::BlockSharedMemStMember<>, public block::sync::BlockSyncNoOp, + public intrinsic::IntrinsicCpu, public rand::RandStdLib, public time::TimeOmp, + public warp::WarpSingleThread, public concepts::Implements> { public: @@ -109,8 +113,8 @@ namespace alpaka atomic::AtomicNoOp // atomics between threads >(), math::MathStdLib(), - block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc(static_cast(blockSharedMemDynSizeBytes)), - block::shared::st::BlockSharedMemStNoSync(), + block::shared::dyn::BlockSharedMemDynMember<>(static_cast(blockSharedMemDynSizeBytes)), + block::shared::st::BlockSharedMemStMember<>(staticMemBegin(), staticMemCapacity()), block::sync::BlockSyncNoOp(), rand::RandStdLib(), time::TimeOmp(), @@ -178,7 +182,9 @@ namespace alpaka // m_threadElemExtentMax vec::Vec::all(std::numeric_limits::max()), // m_threadElemCountMax - std::numeric_limits::max()}; + std::numeric_limits::max(), + // m_sharedMemSizeBytes + static_cast( acc::AccCpuOmp2Blocks::staticAllocBytes )}; } }; //############################################################################# diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp2Threads.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp2Threads.hpp index 1bff0f9ea1..16f62ec12f 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp2Threads.hpp +++ 
b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp2Threads.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -23,11 +23,13 @@ #include #include #include -#include +#include #include #include +#include #include #include +#include // Specialized traits. #include @@ -78,11 +80,13 @@ namespace alpaka atomic::AtomicOmpBuiltIn // thread atomics >, public math::MathStdLib, - public block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc, + public block::shared::dyn::BlockSharedMemDynAlignedAlloc, public block::shared::st::BlockSharedMemStMasterSync, public block::sync::BlockSyncBarrierOmp, + public intrinsic::IntrinsicCpu, public rand::RandStdLib, public time::TimeOmp, + public warp::WarpSingleThread, public concepts::Implements> { public: @@ -110,7 +114,7 @@ namespace alpaka atomic::AtomicOmpBuiltIn // atomics between threads >(), math::MathStdLib(), - block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc(static_cast(blockSharedMemDynSizeBytes)), + block::shared::dyn::BlockSharedMemDynAlignedAlloc(static_cast(blockSharedMemDynSizeBytes)), block::shared::st::BlockSharedMemStMasterSync( [this](){block::sync::syncBlockThreads(*this);}, [](){return (::omp_get_thread_num() == 0);}), @@ -165,8 +169,6 @@ namespace alpaka dev::DevCpu const & dev) -> alpaka::acc::AccDevProps { - alpaka::ignore_unused(dev); - #ifdef ALPAKA_CI auto const blockThreadCountMax(alpaka::core::clipCast(std::min(4, ::omp_get_max_threads()))); #else @@ -186,7 +188,9 @@ namespace alpaka // m_threadElemExtentMax vec::Vec::all(std::numeric_limits::max()), // m_threadElemCountMax - std::numeric_limits::max()}; + std::numeric_limits::max(), + // m_sharedMemSizeBytes + dev::getMemBytes( dev )}; } }; 
//############################################################################# diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp4.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp4.hpp index 2a3236dcb8..ad45488d1e 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp4.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp4.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -23,11 +23,13 @@ #include #include #include -#include +#include #include #include +#include #include #include +#include // Specialized traits. #include @@ -78,11 +80,13 @@ namespace alpaka atomic::AtomicOmpBuiltIn // thread atomics >, public math::MathStdLib, - public block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc, + public block::shared::dyn::BlockSharedMemDynAlignedAlloc, public block::shared::st::BlockSharedMemStMasterSync, public block::sync::BlockSyncBarrierOmp, + public intrinsic::IntrinsicCpu, public rand::RandStdLib, public time::TimeOmp, + public warp::WarpSingleThread, public concepts::Implements> { public: @@ -110,7 +114,7 @@ namespace alpaka atomic::AtomicOmpBuiltIn // atomics between threads >(), math::MathStdLib(), - block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc(static_cast(blockSharedMemDynSizeBytes)), + block::shared::dyn::BlockSharedMemDynAlignedAlloc(static_cast(blockSharedMemDynSizeBytes)), block::shared::st::BlockSharedMemStMasterSync( [this](){block::sync::syncBlockThreads(*this);}, [](){return (::omp_get_thread_num() == 0);}), @@ -165,8 +169,6 @@ namespace alpaka dev::DevCpu const & dev) -> acc::AccDevProps { - alpaka::ignore_unused(dev); - #ifdef ALPAKA_CI auto const blockThreadCountMax(alpaka::core::clipCast(std::min(4, ::omp_get_max_threads()))); #else @@ -186,7 +188,9 @@ 
namespace alpaka // m_threadElemExtentMax vec::Vec::all(std::numeric_limits::max()), // m_threadElemCountMax - std::numeric_limits::max()}; + std::numeric_limits::max(), + // m_sharedMemSizeBytes + dev::getMemBytes( dev )}; } }; //############################################################################# diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuSerial.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuSerial.hpp index ca92f59fd1..3b70877f2a 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuSerial.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuSerial.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -19,11 +19,13 @@ #include #include #include -#include -#include +#include +#include #include +#include #include #include +#include // Specialized traits. 
#include @@ -71,11 +73,13 @@ namespace alpaka atomic::AtomicNoOp // thread atomics >, public math::MathStdLib, - public block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc, - public block::shared::st::BlockSharedMemStNoSync, + public block::shared::dyn::BlockSharedMemDynMember<>, + public block::shared::st::BlockSharedMemStMember<>, public block::sync::BlockSyncNoOp, + public intrinsic::IntrinsicCpu, public rand::RandStdLib, public time::TimeStdLib, + public warp::WarpSingleThread, public concepts::Implements> { public: @@ -103,8 +107,8 @@ namespace alpaka atomic::AtomicNoOp // atomics between threads >(), math::MathStdLib(), - block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc(static_cast(blockSharedMemDynSizeBytes)), - block::shared::st::BlockSharedMemStNoSync(), + block::shared::dyn::BlockSharedMemDynMember<>(static_cast(blockSharedMemDynSizeBytes)), + block::shared::st::BlockSharedMemStMember<>(staticMemBegin(), staticMemCapacity()), block::sync::BlockSyncNoOp(), rand::RandStdLib(), time::TimeStdLib(), @@ -172,7 +176,9 @@ namespace alpaka // m_threadElemExtentMax vec::Vec::all(std::numeric_limits::max()), // m_threadElemCountMax - std::numeric_limits::max()}; + std::numeric_limits::max(), + // m_sharedMemSizeBytes + static_cast< size_t >( acc::AccCpuSerial::staticAllocBytes )}; } }; //############################################################################# diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuTbbBlocks.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuTbbBlocks.hpp index 07dfa8eea4..00b2e189b1 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuTbbBlocks.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuTbbBlocks.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -19,11 +19,13 @@ #include #include #include -#include -#include +#include +#include #include +#include #include #include +#include // Specialized traits. #include @@ -69,11 +71,13 @@ namespace alpaka atomic::AtomicNoOp // thread atomics >, public math::MathStdLib, - public block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc, - public block::shared::st::BlockSharedMemStNoSync, + public block::shared::dyn::BlockSharedMemDynMember<>, + public block::shared::st::BlockSharedMemStMember<>, public block::sync::BlockSyncNoOp, + public intrinsic::IntrinsicCpu, public rand::RandStdLib, public time::TimeStdLib, + public warp::WarpSingleThread, public concepts::Implements> { public: @@ -101,8 +105,8 @@ namespace alpaka atomic::AtomicNoOp // atomics between threads >(), math::MathStdLib(), - block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc(static_cast(blockSharedMemDynSizeBytes)), - block::shared::st::BlockSharedMemStNoSync(), + block::shared::dyn::BlockSharedMemDynMember<>(static_cast(blockSharedMemDynSizeBytes)), + block::shared::st::BlockSharedMemStMember<>(staticMemBegin(), staticMemCapacity()), block::sync::BlockSyncNoOp(), rand::RandStdLib(), time::TimeStdLib(), @@ -170,7 +174,9 @@ namespace alpaka // m_threadElemExtentMax vec::Vec::all(std::numeric_limits::max()), // m_threadElemCountMax - std::numeric_limits::max()}; + std::numeric_limits::max(), + // m_sharedMemSizeBytes + static_cast< size_t >( acc::AccCpuTbbBlocks::staticAllocBytes )}; } }; diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuThreads.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuThreads.hpp index 902674be30..a40141d690 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuThreads.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuThreads.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. 
* * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -18,11 +18,13 @@ #include #include #include -#include +#include #include #include +#include #include #include +#include // Specialized traits. #include @@ -73,11 +75,13 @@ namespace alpaka atomic::AtomicStdLibLock<16> // thread atomics >, public math::MathStdLib, - public block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc, + public block::shared::dyn::BlockSharedMemDynAlignedAlloc, public block::shared::st::BlockSharedMemStMasterSync, public block::sync::BlockSyncBarrierThread, + public intrinsic::IntrinsicCpu, public rand::RandStdLib, public time::TimeStdLib, + public warp::WarpSingleThread, public concepts::Implements> { public: @@ -105,7 +109,7 @@ namespace alpaka atomic::AtomicStdLibLock<16> // atomics between threads >(), math::MathStdLib(), - block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc(static_cast(blockSharedMemDynSizeBytes)), + block::shared::dyn::BlockSharedMemDynAlignedAlloc(static_cast(blockSharedMemDynSizeBytes)), block::shared::st::BlockSharedMemStMasterSync( [this](){block::sync::syncBlockThreads(*this);}, [this](){return (m_idMasterThread == std::this_thread::get_id());}), @@ -166,8 +170,6 @@ namespace alpaka dev::DevCpu const & dev) -> acc::AccDevProps { - alpaka::ignore_unused(dev); - #ifdef ALPAKA_CI auto const blockThreadCountMax(static_cast(8)); #else @@ -189,7 +191,9 @@ namespace alpaka // m_threadElemExtentMax vec::Vec::all(std::numeric_limits::max()), // m_threadElemCountMax - std::numeric_limits::max()}; + std::numeric_limits::max(), + // m_sharedMemSizeBytes + dev::getMemBytes( dev )}; } }; //############################################################################# diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccDevProps.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccDevProps.hpp index e561453dd8..856f87bcaa 100644 --- 
a/thirdParty/cupla/alpaka/include/alpaka/acc/AccDevProps.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccDevProps.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -24,7 +24,6 @@ namespace alpaka // // \TODO: // TIdx m_maxClockFrequencyHz; //!< Maximum clock frequency of the device in Hz. - // TIdx m_sharedMemSizeBytes; //!< Idx of the available block shared memory in bytes. template< typename TDim, typename TIdx> @@ -38,14 +37,16 @@ namespace alpaka vec::Vec const & blockThreadExtentMax, TIdx const & blockThreadCountMax, vec::Vec const & threadElemExtentMax, - TIdx const & threadElemCountMax) : + TIdx const & threadElemCountMax, + size_t const & sharedMemSizeBytes) : m_gridBlockExtentMax(gridBlockExtentMax), m_blockThreadExtentMax(blockThreadExtentMax), m_threadElemExtentMax(threadElemExtentMax), m_gridBlockCountMax(gridBlockCountMax), m_blockThreadCountMax(blockThreadCountMax), m_threadElemCountMax(threadElemCountMax), - m_multiProcessorCount(multiProcessorCount) + m_multiProcessorCount(multiProcessorCount), + m_sharedMemSizeBytes(sharedMemSizeBytes) {} // NOTE: The members have been reordered from the order in the constructor because gcc is buggy for some TDim and TIdx and generates invalid assembly. @@ -58,6 +59,7 @@ namespace alpaka TIdx m_threadElemCountMax; //!< The maximum number of elements in a threads. TIdx m_multiProcessorCount; //!< The number of multiprocessors. 
+ size_t m_sharedMemSizeBytes; //!< The size of shared memory per block }; } } diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuCudaRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuCudaRt.hpp index 341ee756b9..769644dbb6 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuCudaRt.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuCudaRt.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -55,8 +55,7 @@ namespace alpaka typename TIdx> class AccGpuCudaRt final : public acc::AccGpuUniformCudaHipRt, - public concepts::Implements>, - public concepts::Implements> + public concepts::Implements> { public: //----------------------------------------------------------------------------- diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuHipRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuHipRt.hpp index 60af212503..98fe667471 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuHipRt.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuHipRt.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -54,8 +54,7 @@ namespace alpaka typename TIdx> class AccGpuHipRt final : public acc::AccGpuUniformCudaHipRt, - public concepts::Implements>, - public concepts::Implements> + public concepts::Implements> { public: //----------------------------------------------------------------------------- diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp index 69eb9ae3c8..6f5aec73d8 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -31,8 +31,10 @@ #include #include #include +#include #include #include +#include // Specialized traits. 
#include @@ -82,8 +84,11 @@ namespace alpaka public block::shared::dyn::BlockSharedMemDynUniformCudaHipBuiltIn, public block::shared::st::BlockSharedMemStUniformCudaHipBuiltIn, public block::sync::BlockSyncUniformCudaHipBuiltIn, + public intrinsic::IntrinsicUniformCudaHipBuiltIn, public rand::RandUniformCudaHipRand, - public time::TimeUniformCudaHipBuiltIn + public time::TimeUniformCudaHipBuiltIn, + public warp::WarpUniformCudaHipBuiltIn, + public concepts::Implements> { public: //----------------------------------------------------------------------------- @@ -192,6 +197,12 @@ namespace alpaka cudaDevAttrMaxThreadsPerBlock, dev.m_iDevice)); + int sharedMemSizeBytes = {}; + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaDeviceGetAttribute( + &sharedMemSizeBytes, + cudaDevAttrMaxSharedMemoryPerBlock, + dev.m_iDevice)); + return { // m_multiProcessorCount alpaka::core::clipCast(multiProcessorCount), @@ -214,7 +225,9 @@ namespace alpaka // m_threadElemExtentMax vec::Vec::all(std::numeric_limits::max()), // m_threadElemCountMax - std::numeric_limits::max() + std::numeric_limits::max(), + // m_sharedMemSizeBytes + static_cast(sharedMemSizeBytes) }; #else @@ -245,7 +258,9 @@ namespace alpaka // m_threadElemExtentMax vec::Vec::all(std::numeric_limits::max()), // m_threadElemCountMax - std::numeric_limits::max() + std::numeric_limits::max(), + // m_sharedMemSizeBytes + static_cast(hipDevProp.sharedMemPerBlock) }; #endif } @@ -307,7 +322,7 @@ namespace alpaka //! specialization of the TKernelFnObj return type evaluation // // It is not possible to determine the result type of a __device__ lambda for CUDA on the host side. - // https://github.com/ComputationalRadiationPhysics/alpaka/pull/695#issuecomment-446103194 + // https://github.com/alpaka-group/alpaka/pull/695#issuecomment-446103194 // The execution task TaskKernelGpuUniformCudaHipRt is therefore performing this check on device side. 
template< typename TDim, diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/Traits.hpp index 1d2e4b2ca6..4f761e19f4 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/acc/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/acc/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -102,9 +102,10 @@ namespace alpaka TDev const & dev) -> AccDevProps, idx::Idx> { + using ImplementationBase = concepts::ImplementationBase; return traits::GetAccDevProps< - TAcc> + ImplementationBase> ::getAccDevProps( dev); } diff --git a/thirdParty/cupla/alpaka/include/alpaka/alpaka.hpp b/thirdParty/cupla/alpaka/include/alpaka/alpaka.hpp index f1c0e195aa..dbe09678a4 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/alpaka.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/alpaka.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -45,13 +45,15 @@ //----------------------------------------------------------------------------- // dynamic #include - #include + #include + #include #include //----------------------------------------------------------------------------- // static #include #include #include + #include #include //----------------------------------------------------------------------------- // sync @@ -65,6 +67,7 @@ // core #include #include +#include #include #include #include @@ -128,7 +131,7 @@ #include //----------------------------------------------------------------------------- // mem -#include +#include #include #include diff --git a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicHierarchy.hpp b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicHierarchy.hpp index 8f9141fb88..59a4ceb76b 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicHierarchy.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicHierarchy.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -27,7 +27,7 @@ namespace alpaka // This implementation inherit from all three hierarchies. // The multiple usage of the same type for different levels is allowed. // The class provide the feature that each atomic operation can be focused - // to a hierarchy level in Alpaka. A operation to a hierarchy is independent + // to a hierarchy level in alpaka. A operation to a hierarchy is independent // to the memory hierarchy. 
// // \tparam TGridAtomic atomic implementation for atomic operations between grids within a device diff --git a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicNoOp.hpp b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicNoOp.hpp index 997b98229b..190fade370 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicNoOp.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicNoOp.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicOmpBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicOmpBuiltIn.hpp index 02cba6acfd..f0d331d1b7 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicOmpBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicOmpBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicStdLibLock.hpp b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicStdLibLock.hpp index 816149f1b0..32169409a0 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicStdLibLock.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicStdLibLock.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp index 400c5efa97..01c1846a0d 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/atomic/Op.hpp b/thirdParty/cupla/alpaka/include/alpaka/atomic/Op.hpp index d7aa18bc01..f5cdd137df 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/atomic/Op.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/atomic/Op.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/atomic/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/atomic/Traits.hpp index 3d6f120ee7..ba39e7ff04 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/atomic/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/atomic/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynAlignedAlloc.hpp similarity index 74% rename from thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp rename to thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynAlignedAlloc.hpp index 9206f1752c..6b84800341 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynAlignedAlloc.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -12,10 +12,9 @@ #include #include +#include #include -#include - #include #include @@ -29,35 +28,35 @@ namespace alpaka { //############################################################################# //! The block shared dynamic memory allocator without synchronization. 
- class BlockSharedMemDynBoostAlignedAlloc : public concepts::Implements + class BlockSharedMemDynAlignedAlloc : public concepts::Implements { public: //----------------------------------------------------------------------------- - BlockSharedMemDynBoostAlignedAlloc( + BlockSharedMemDynAlignedAlloc( std::size_t const & blockSharedMemDynSizeBytes) { if(blockSharedMemDynSizeBytes > 0u) { m_blockSharedMemDyn.reset( reinterpret_cast( - boost::alignment::aligned_alloc(core::vectorization::defaultAlignment, blockSharedMemDynSizeBytes))); + core::alignedAlloc(core::vectorization::defaultAlignment, blockSharedMemDynSizeBytes))); } } //----------------------------------------------------------------------------- - BlockSharedMemDynBoostAlignedAlloc(BlockSharedMemDynBoostAlignedAlloc const &) = delete; + BlockSharedMemDynAlignedAlloc(BlockSharedMemDynAlignedAlloc const &) = delete; //----------------------------------------------------------------------------- - BlockSharedMemDynBoostAlignedAlloc(BlockSharedMemDynBoostAlignedAlloc &&) = delete; + BlockSharedMemDynAlignedAlloc(BlockSharedMemDynAlignedAlloc &&) = delete; //----------------------------------------------------------------------------- - auto operator=(BlockSharedMemDynBoostAlignedAlloc const &) -> BlockSharedMemDynBoostAlignedAlloc & = delete; + auto operator=(BlockSharedMemDynAlignedAlloc const &) -> BlockSharedMemDynAlignedAlloc & = delete; //----------------------------------------------------------------------------- - auto operator=(BlockSharedMemDynBoostAlignedAlloc &&) -> BlockSharedMemDynBoostAlignedAlloc & = delete; + auto operator=(BlockSharedMemDynAlignedAlloc &&) -> BlockSharedMemDynAlignedAlloc & = delete; //----------------------------------------------------------------------------- - /*virtual*/ ~BlockSharedMemDynBoostAlignedAlloc() = default; + /*virtual*/ ~BlockSharedMemDynAlignedAlloc() = default; public: std::unique_ptr< uint8_t, - boost::alignment::aligned_delete> mutable + 
core::AlignedDelete> mutable m_blockSharedMemDyn; //!< Block shared dynamic memory. }; @@ -72,11 +71,11 @@ namespace alpaka typename T> struct GetMem< T, - BlockSharedMemDynBoostAlignedAlloc> + BlockSharedMemDynAlignedAlloc> { //----------------------------------------------------------------------------- ALPAKA_FN_HOST static auto getMem( - block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc const & blockSharedMemDyn) + block::shared::dyn::BlockSharedMemDynAlignedAlloc const & blockSharedMemDyn) -> T * { static_assert( diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp new file mode 100644 index 0000000000..fbcd1db54e --- /dev/null +++ b/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp @@ -0,0 +1,123 @@ +/* Copyright 2020 Jeffrey Kelling + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +#ifndef ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB +#define ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB 30 +#endif + +namespace alpaka +{ + namespace block + { + namespace shared + { + namespace dyn + { +#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED) + #pragma warning(push) + #pragma warning(disable: 4324) // warning C4324: structure was padded due to alignment specifier +#endif + //############################################################################# + //! Dynamic block shared memory provider using fixed-size + //! member array to allocate memory on the stack or in shared + //! memory. 
+ template + class alignas(core::vectorization::defaultAlignment) BlockSharedMemDynMember : + public concepts::Implements> + { + public: + //----------------------------------------------------------------------------- + BlockSharedMemDynMember(unsigned int sizeBytes) + : m_dynPitch((sizeBytes/core::vectorization::defaultAlignment + + (sizeBytes%core::vectorization::defaultAlignment>0))*core::vectorization::defaultAlignment) + { +#if (defined ALPAKA_DEBUG_OFFLOAD_ASSUME_HOST) && (! defined NDEBUG) + ALPAKA_ASSERT(sizeBytes <= staticAllocBytes); +#endif + } + //----------------------------------------------------------------------------- + BlockSharedMemDynMember(BlockSharedMemDynMember const &) = delete; + //----------------------------------------------------------------------------- + BlockSharedMemDynMember(BlockSharedMemDynMember &&) = delete; + //----------------------------------------------------------------------------- + auto operator=(BlockSharedMemDynMember const &) -> BlockSharedMemDynMember & = delete; + //----------------------------------------------------------------------------- + auto operator=(BlockSharedMemDynMember &&) -> BlockSharedMemDynMember & = delete; + //----------------------------------------------------------------------------- + /*virtual*/ ~BlockSharedMemDynMember() = default; + + uint8_t* dynMemBegin() const {return m_mem.data();} + + /*! \return the pointer to the begin of data after the portion allocated as dynamical shared memory. + */ + uint8_t* staticMemBegin() const + { + return m_mem.data() + m_dynPitch; + } + + /*! \return the remaining capacity for static block shared memory. + */ + unsigned int staticMemCapacity() const + { + return staticAllocBytes - m_dynPitch; + } + + //! 
Storage size in bytes + static constexpr unsigned int staticAllocBytes = TStaticAllocKiB<<10; + + private: + mutable std::array m_mem; + unsigned int m_dynPitch; + }; +#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED) + #pragma warning(pop) +#endif + + namespace traits + { + //############################################################################# + template< + typename T, + unsigned int TStaticAllocKiB> + struct GetMem< + T, + BlockSharedMemDynMember> + { +#if BOOST_COMP_GNUC + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wcast-align" // "cast from 'unsigned char*' to 'unsigned int*' increases required alignment of target type" +#endif + //----------------------------------------------------------------------------- + static auto getMem( + block::shared::dyn::BlockSharedMemDynMember const &mem) + -> T * + { + static_assert( + core::vectorization::defaultAlignment >= alignof(T), + "Unable to get block shared dynamic memory for types with alignment higher than defaultAlignment!"); + return reinterpret_cast(mem.dynMemBegin()); + } +#if BOOST_COMP_GNUC + #pragma GCC diagnostic pop +#endif + }; + } + } + } + } +} diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp index f647b4f7f1..24ee8d44c2 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/Traits.hpp index 27445abb68..3069e2969b 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp index e0959ea29b..85615144b6 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -12,10 +12,9 @@ #include #include +#include #include -#include - #include #include #include @@ -57,7 +56,7 @@ namespace alpaka std::vector< std::unique_ptr< uint8_t, - boost::alignment::aligned_delete>> mutable + core::AlignedDelete>> mutable m_sharedAllocs; std::function m_syncFn; @@ -93,7 +92,7 @@ namespace alpaka { blockSharedMemSt.m_sharedAllocs.emplace_back( reinterpret_cast( - boost::alignment::aligned_alloc(alignmentInBytes, sizeof(T)))); + core::alignedAlloc(alignmentInBytes, sizeof(T)))); } blockSharedMemSt.m_syncFn(); diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStMember.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStMember.hpp new file mode 100644 index 0000000000..c357f65f48 --- /dev/null +++ b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStMember.hpp @@ -0,0 +1,153 @@ +/* Copyright 2020 Jeffrey Kelling + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include + +namespace alpaka +{ + namespace block + { + namespace shared + { + namespace st + { + namespace detail + { + //############################################################################# + //! Implementation of static block shared memory provider. 
+ template + class BlockSharedMemStMemberImpl + { + public: + //----------------------------------------------------------------------------- +#ifndef NDEBUG + BlockSharedMemStMemberImpl(uint8_t* mem, unsigned int capacity) : m_mem(mem), m_capacity(capacity) {} +#else + BlockSharedMemStMemberImpl(uint8_t* mem, unsigned int) : m_mem(mem) {} +#endif + //----------------------------------------------------------------------------- + BlockSharedMemStMemberImpl(BlockSharedMemStMemberImpl const &) = delete; + //----------------------------------------------------------------------------- + BlockSharedMemStMemberImpl(BlockSharedMemStMemberImpl &&) = delete; + //----------------------------------------------------------------------------- + auto operator=(BlockSharedMemStMemberImpl const &) -> BlockSharedMemStMemberImpl & = delete; + //----------------------------------------------------------------------------- + auto operator=(BlockSharedMemStMemberImpl &&) -> BlockSharedMemStMemberImpl & = delete; + //----------------------------------------------------------------------------- + /*virtual*/ ~BlockSharedMemStMemberImpl() = default; + + template + void alloc() const + { + m_allocdBytes = allocPitch(); + uint8_t* buf = &m_mem[m_allocdBytes]; + new (buf) T(); + m_allocdBytes += sizeof(T); +#if (defined ALPAKA_DEBUG_OFFLOAD_ASSUME_HOST) && (! 
defined NDEBUG) + ALPAKA_ASSERT(m_allocdBytes < m_capacity); +#endif + } + +#if BOOST_COMP_GNUC + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wcast-align" // "cast from 'unsigned char*' to 'unsigned int*' increases required alignment of target type" +#endif + template + T& getLatestVar() const + { + return *reinterpret_cast(&m_mem[m_allocdBytes-sizeof(T)]); + } +#if BOOST_COMP_GNUC + #pragma GCC diagnostic pop +#endif + + void free() const + { + m_allocdBytes = 0u; + } + + private: + mutable unsigned int m_allocdBytes = 0; + mutable uint8_t* m_mem; +#ifndef NDEBUG + const unsigned int m_capacity; +#endif + + template + unsigned int allocPitch() const + { + static_assert( + core::vectorization::defaultAlignment >= alignof(T), + "Unable to get block shared static memory for types with alignment higher than defaultAlignment!"); + constexpr unsigned int align = std::max(TDataAlignBytes, static_cast(alignof(T))); + return (m_allocdBytes/align + (m_allocdBytes%align>0))*align; + } + }; + } + //############################################################################# + //! Static block shared memory provider using a pointer to + //! externally allocated fixed-size memory, likely provided by + //! BlockSharedMemDynMember. 
+ template + class BlockSharedMemStMember : + public detail::BlockSharedMemStMemberImpl, + public concepts::Implements> + { + public: + using detail::BlockSharedMemStMemberImpl::BlockSharedMemStMemberImpl; + }; + + namespace traits + { + //############################################################################# + template< + typename T, + unsigned int TDataAlignBytes, + std::size_t TuniqueId> + struct AllocVar< + T, + TuniqueId, + BlockSharedMemStMember> + { + //----------------------------------------------------------------------------- + static auto allocVar( + block::shared::st::BlockSharedMemStMember const &smem) + -> T & + { + smem.template alloc(); + return smem.template getLatestVar(); + } + }; + //############################################################################# + template< + unsigned int TDataAlignBytes> + struct FreeMem< + BlockSharedMemStMember> + { + //----------------------------------------------------------------------------- + static auto freeMem( + block::shared::st::BlockSharedMemStMember const &mem) + -> void + { + mem.free(); + } + }; + } + } + } + } +} diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStNoSync.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStNoSync.hpp index 0ebf72a052..2b3955360d 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStNoSync.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStNoSync.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -12,10 +12,9 @@ #include #include +#include #include -#include - #include #include @@ -51,7 +50,7 @@ namespace alpaka std::vector< std::unique_ptr< uint8_t, - boost::alignment::aligned_delete>> mutable + core::AlignedDelete>> mutable m_sharedAllocs; }; @@ -79,7 +78,7 @@ namespace alpaka blockSharedMemSt.m_sharedAllocs.emplace_back( reinterpret_cast( - boost::alignment::aligned_alloc(alignmentInBytes, sizeof(T)))); + core::alignedAlloc(alignmentInBytes, sizeof(T)))); return std::ref( *reinterpret_cast( diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp index 98304b8a2a..482df65169 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Erik Zenker, René Widera, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/Traits.hpp index 56c5644b72..ed92990fd1 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierFiber.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierFiber.hpp index 0d8e2d6b2f..00e1c4e80c 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierFiber.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierFiber.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierOmp.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierOmp.hpp index 1676b4b51d..780e851bea 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierOmp.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierOmp.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierThread.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierThread.hpp index 8f8755a1d9..c77c4aa373 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierThread.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierThread.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncNoOp.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncNoOp.hpp index d93c9acf90..c7b8ef427d 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncNoOp.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncNoOp.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp index 238475d0ea..544747c98d 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/sync/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/sync/Traits.hpp index a74b9c0959..e40c83aca7 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/block/sync/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/block/sync/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Align.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Align.hpp index ac34cc7f9c..1d1e5c2a25 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/core/Align.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/core/Align.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/AlignedAlloc.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/AlignedAlloc.hpp new file mode 100644 index 0000000000..77af609ccb --- /dev/null +++ b/thirdParty/cupla/alpaka/include/alpaka/core/AlignedAlloc.hpp @@ -0,0 +1,72 @@ +/* Copyright 2020 René Widera + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include +#include + +#if BOOST_COMP_MSVC + #include +#else + #include +#endif + +namespace alpaka +{ + namespace core + { + //----------------------------------------------------------------------------- + //! Rounds to the next higher power of two (if not already power of two). 
+ // Adapted from llvm/ADT/SmallPtrSet.h + ALPAKA_FN_INLINE ALPAKA_FN_HOST + void* alignedAlloc(size_t alignment, size_t size) + { +#if BOOST_OS_WINDOWS + return _aligned_malloc(size, alignment); +#elif BOOST_OS_MACOS + void * ptr = nullptr; + posix_memalign(&ptr, alignment, size); + return ptr; +#else + // the amount of bytes to allocate must be a multiple of the alignment + size_t sizeToAllocate = ((size + alignment - 1u) / alignment) * alignment; + return ::aligned_alloc(alignment, sizeToAllocate); +#endif + } + + ALPAKA_FN_INLINE ALPAKA_FN_HOST + void alignedFree(void* ptr) + { +#if BOOST_OS_WINDOWS + _aligned_free(ptr); +#else + // linux and macos + free(ptr); +#endif + } + + //############################################################################# + //! destroy aligned object and free aligned memory + struct AlignedDelete + { + constexpr AlignedDelete() = default; + + //----------------------------------------------------------------------------- + //! Calls ~T() on ptr to destroy the object and then calls aligned_free to free the allocated memory. + template + void operator()(T* ptr) const + { + if (ptr) + ptr->~T(); + alignedFree(reinterpret_cast(ptr)); + } + }; + } +} diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Assert.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Assert.hpp index faed43b616..6eb901dc1d 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/core/Assert.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/core/Assert.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/BarrierThread.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/BarrierThread.hpp index 8853b0970f..276d746683 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/core/BarrierThread.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/core/BarrierThread.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/BoostPredef.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/BoostPredef.hpp index 6d25d46667..0f04c4deb2 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/core/BoostPredef.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/core/BoostPredef.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -119,3 +119,19 @@ #else #define BOOST_COMP_CLANG_CUDA BOOST_VERSION_NUMBER_NOT_AVAILABLE #endif + +//----------------------------------------------------------------------------- +// Intel compiler detection +// BOOST_COMP_INTEL_EMULATED is defined by boost instead of BOOST_COMP_INTEL +#if defined(BOOST_COMP_INTEL) && defined(BOOST_COMP_INTEL_EMULATED) + #undef BOOST_COMP_INTEL + #define BOOST_COMP_INTEL BOOST_COMP_INTEL_EMULATED +#endif + +//----------------------------------------------------------------------------- +// PGI and NV HPC SDK compiler detection +// BOOST_COMP_PGI_EMULATED is defined by boost instead of BOOST_COMP_PGI +#if defined(BOOST_COMP_PGI) && defined(BOOST_COMP_PGI_EMULATED) + #undef BOOST_COMP_PGI + #define BOOST_COMP_PGI BOOST_COMP_PGI_EMULATED +#endif diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/ClipCast.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/ClipCast.hpp index 1202c5c397..f48d8ad92e 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/core/ClipCast.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/core/ClipCast.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Common.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Common.hpp index e71d5296f5..b2c390836b 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/core/Common.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/core/Common.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -17,23 +17,6 @@ #include #endif -//----------------------------------------------------------------------------- -// Boost does not yet correctly identify clang when compiling CUDA code. -// After explicitly including we can safely undefine some of the wrong settings. -#if BOOST_COMP_CLANG_CUDA - #include - #undef BOOST_NO_CXX11_VARIADIC_TEMPLATES -#endif - -//----------------------------------------------------------------------------- -// Boost disables variadic templates for nvcc (in some cases because it was buggy). -// However, we rely on it being enabled. -// After explicitly including we can safely undefine the wrong setting. -#if BOOST_COMP_NVCC - #include - #undef BOOST_NO_CXX11_VARIADIC_TEMPLATES -#endif - //----------------------------------------------------------------------------- //! All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_FN_HOST_ACC. //! @@ -66,7 +49,7 @@ //! WARNING: Only use this method if there is no other way. //! Most cases can be solved by #if BOOST_ARCH_PTX or #if BOOST_LANG_CUDA. #if (BOOST_LANG_CUDA && !BOOST_COMP_CLANG_CUDA) || BOOST_LANG_HIP - #if BOOST_COMP_MSVC + #if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED) #define ALPAKA_NO_HOST_ACC_WARNING __pragma(hd_warning_disable) #else #define ALPAKA_NO_HOST_ACC_WARNING _Pragma("hd_warning_disable") diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Concepts.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Concepts.hpp index 45a0ba4e96..27a8092f41 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/core/Concepts.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/core/Concepts.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/ConcurrentExecPool.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/ConcurrentExecPool.hpp index 0e788496aa..0264ffa987 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/core/ConcurrentExecPool.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/core/ConcurrentExecPool.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -16,8 +16,7 @@ // Therefore, we can not even parse those parts when compiling device code. //----------------------------------------------------------------------------- #include - -#include +#include #include #include diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Cuda.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Cuda.hpp index 098debafd1..a8ccb6ee40 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/core/Cuda.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/core/Cuda.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Debug.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Debug.hpp index def86d5e56..3429f26caa 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/core/Debug.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/core/Debug.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Alexander Matthes, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -42,7 +42,7 @@ namespace alpaka { public: //----------------------------------------------------------------------------- - ScopeLogStdOut( + explicit ScopeLogStdOut( std::string const & sScope) : m_sScope(sScope) { diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Decay.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Decay.hpp new file mode 100644 index 0000000000..c244b6b2a7 --- /dev/null +++ b/thirdParty/cupla/alpaka/include/alpaka/core/Decay.hpp @@ -0,0 +1,33 @@ +/* Copyright 2020 Sergei Bastrakov + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include + +#include + +//----------------------------------------------------------------------------- +//! Wrapper around std::decay_t for parameter pack expansion expressions +// +// Works around Intel compiler internal error when used in empty template pack +// extension as discussed in #995. It seems not possible to make a workaround +// with pure C++ tools, like an alias template, so macro it is. Note that +// there is no known issue outside of empty parameter pack expansions, +// so the normal std::decay_t can and should be used there. +// +// The choice of macro over writing typename std::decay::type explicitly +// in parameter pack expansion expressions is to avoid warnings from diagnostic +// tools, and also for brevity. 
+//----------------------------------------------------------------------------- +#ifdef BOOST_COMP_INTEL + #define ALPAKA_DECAY_T(Type) typename std::decay::type +#else + #define ALPAKA_DECAY_T(Type) std::decay_t +#endif diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Fibers.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Fibers.hpp index a9a156b19c..f2b89bb651 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/core/Fibers.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/core/Fibers.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Hip.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Hip.hpp index 35d6a48a44..15e6b3ed18 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/core/Hip.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/core/Hip.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Positioning.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Positioning.hpp index 4147f8ccb1..288c490cf5 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/core/Positioning.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/core/Positioning.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -12,7 +12,7 @@ namespace alpaka { //############################################################################# - //! Defines the parallelism hierarchy levels of Alpaka + //! Defines the parallelism hierarchy levels of alpaka namespace hierarchy { struct Grids{}; diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/UniformCudaHip.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/UniformCudaHip.hpp index 44ad40de4d..285af41a81 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/core/UniformCudaHip.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/core/UniformCudaHip.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -90,6 +90,11 @@ namespace alpaka { rtCheck(error, ("'" + std::string(cmd) + "' returned error ").c_str(), file, line); } + else + { + // reset the last error to avoid propagation to the next CUDA/HIP API call + ALPAKA_API_PREFIX(GetLastError)(); + } } } //----------------------------------------------------------------------------- @@ -107,7 +112,7 @@ namespace alpaka } } -#if BOOST_COMP_MSVC +#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED) //----------------------------------------------------------------------------- //! CUDA runtime error checking with log and exception, ignoring specific error values #define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(cmd, ...)\ diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Unroll.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Unroll.hpp index 829be5ca7f..3bf382fa8b 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/core/Unroll.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/core/Unroll.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. 
+ * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -19,7 +19,7 @@ //! for(...){...}` // \TODO: Implement for other compilers. #if BOOST_ARCH_PTX - #if BOOST_COMP_MSVC + #if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED) #define ALPAKA_UNROLL(...) __pragma(unroll __VA_ARGS__) #else #define ALPAKA_UNROLL_STRINGIFY(x) #x diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Unused.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Unused.hpp index 76fe3aa06f..ce63e9fb70 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/core/Unused.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/core/Unused.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -11,13 +11,11 @@ #include -#include - namespace alpaka { ALPAKA_NO_HOST_ACC_WARNING template< typename... Ts > - BOOST_FORCEINLINE + ALPAKA_FN_INLINE constexpr ALPAKA_FN_HOST_ACC void @@ -26,7 +24,7 @@ namespace alpaka ALPAKA_NO_HOST_ACC_WARNING template< typename... Ts > - BOOST_FORCEINLINE + ALPAKA_FN_INLINE constexpr ALPAKA_FN_HOST_ACC void diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Utility.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Utility.hpp index ddf03f33ae..aa3530b72c 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/core/Utility.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/core/Utility.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Vectorize.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Vectorize.hpp index a90ef0a4f5..f797539874 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/core/Vectorize.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/core/Vectorize.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/dev/DevCpu.hpp b/thirdParty/cupla/alpaka/include/alpaka/dev/DevCpu.hpp index 9fa0b6650e..cfc2b0ce47 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/dev/DevCpu.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/dev/DevCpu.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -14,13 +14,16 @@ #include #include -#include +#include #include #include #include #include +#include +#include + #include #include #include @@ -29,18 +32,15 @@ namespace alpaka { + namespace dev + { + class DevCpu; + } namespace queue { - class QueueCpuNonBlocking; - class QueueCpuBlocking; - namespace cpu { - namespace detail - { - class QueueCpuNonBlockingImpl; - class QueueCpuBlockingImpl; - } + using ICpuQueue = IGenericThreadsQueue; } } namespace pltf @@ -66,18 +66,30 @@ namespace alpaka //! The CPU device implementation. 
class DevCpuImpl { - private: + public: + //----------------------------------------------------------------------------- + DevCpuImpl() = default; + //----------------------------------------------------------------------------- + DevCpuImpl(DevCpuImpl const &) = delete; + //----------------------------------------------------------------------------- + DevCpuImpl(DevCpuImpl &&) = delete; + //----------------------------------------------------------------------------- + auto operator=(DevCpuImpl const &) -> DevCpuImpl & = delete; + //----------------------------------------------------------------------------- + auto operator=(DevCpuImpl &&) -> DevCpuImpl & = delete; + //----------------------------------------------------------------------------- + ~DevCpuImpl() = default; //----------------------------------------------------------------------------- - ALPAKA_FN_HOST auto GetAllQueueImpls( - std::vector> & queues) const + ALPAKA_FN_HOST auto getAllExistingQueues() const -> std::vector> { std::vector> vspQueues; std::lock_guard lk(m_Mutex); + vspQueues.reserve(m_queues.size()); - for(auto it = queues.begin(); it != queues.end();) + for(auto it = m_queues.begin(); it != m_queues.end();) { auto spQueue(it->lock()); if(spQueue) @@ -87,36 +99,16 @@ namespace alpaka } else { - it = queues.erase(it); + it = m_queues.erase(it); } } return vspQueues; } - public: - //----------------------------------------------------------------------------- - DevCpuImpl() = default; - //----------------------------------------------------------------------------- - DevCpuImpl(DevCpuImpl const &) = delete; - //----------------------------------------------------------------------------- - DevCpuImpl(DevCpuImpl &&) = delete; - //----------------------------------------------------------------------------- - auto operator=(DevCpuImpl const &) -> DevCpuImpl & = delete; - //----------------------------------------------------------------------------- - auto operator=(DevCpuImpl &&) -> 
DevCpuImpl & = delete; - //----------------------------------------------------------------------------- - ~DevCpuImpl() = default; - - ALPAKA_FN_HOST auto GetAllQueues() const - -> std::vector> - { - return GetAllQueueImpls(m_queues); - } - //----------------------------------------------------------------------------- //! Registers the given queue on this device. //! NOTE: Every queue has to be registered for correct functionality of device wait operations! - ALPAKA_FN_HOST auto RegisterQueue(std::shared_ptr spQueue) + ALPAKA_FN_HOST auto registerQueue(std::shared_ptr spQueue) const -> void { std::lock_guard lk(m_Mutex); @@ -134,7 +126,9 @@ namespace alpaka //############################################################################# //! The CPU device handle. - class DevCpu : public concepts::Implements + class DevCpu : + public concepts::Implements, + public concepts::Implements { friend struct pltf::traits::GetDevByIdx; protected: @@ -166,6 +160,21 @@ namespace alpaka //----------------------------------------------------------------------------- ~DevCpu() = default; + ALPAKA_FN_HOST auto getAllQueues() const + -> std::vector> + { + return m_spDevCpuImpl->getAllExistingQueues(); + } + + //----------------------------------------------------------------------------- + //! Registers the given queue on this device. + //! NOTE: Every queue has to be registered for correct functionality of device wait operations! + ALPAKA_FN_HOST auto registerQueue(std::shared_ptr spQueue) const + -> void + { + m_spDevCpuImpl->registerQueue(spQueue); + } + public: std::shared_ptr m_spDevCpuImpl; }; @@ -226,6 +235,23 @@ namespace alpaka } }; + //############################################################################# + //! The CPU device warp size get trait specialization. 
+ template<> + struct GetWarpSize< + dev::DevCpu> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto getWarpSize( + dev::DevCpu const & dev) + -> std::size_t + { + alpaka::ignore_unused(dev); + + return 1u; + } + }; + //############################################################################# //! The CPU device reset trait specialization. template<> @@ -291,6 +317,9 @@ namespace alpaka } namespace queue { + using QueueCpuNonBlocking = QueueGenericThreadsNonBlocking; + using QueueCpuBlocking = QueueGenericThreadsBlocking; + namespace traits { template<> diff --git a/thirdParty/cupla/alpaka/include/alpaka/dev/DevUniformCudaHipRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/dev/DevUniformCudaHipRt.hpp index e6b69ac2f5..b66e76a0b9 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/dev/DevUniformCudaHipRt.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/dev/DevUniformCudaHipRt.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -59,7 +59,9 @@ namespace alpaka { //############################################################################# //! The CUDA/HIP RT device handle. - class DevUniformCudaHipRt : public concepts::Implements + class DevUniformCudaHipRt : + public concepts::Implements, + public concepts::Implements { friend struct pltf::traits::GetDevByIdx; @@ -187,6 +189,31 @@ namespace alpaka } }; + //############################################################################# + //! The CUDA/HIP RT device warp size get trait specialization. 
+ template<> + struct GetWarpSize< + dev::DevUniformCudaHipRt> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto getWarpSize( + dev::DevUniformCudaHipRt const & dev) + -> std::size_t + { +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + cudaDeviceProp devProp; +#else + hipDeviceProp_t devProp; +#endif + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(GetDeviceProperties)( + &devProp, + dev.m_iDevice)); + + return static_cast(devProp.warpSize); + } + }; + //############################################################################# //! The CUDA/HIP RT device reset trait specialization. template<> diff --git a/thirdParty/cupla/alpaka/include/alpaka/dev/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/dev/Traits.hpp index 165e80d6d2..85fed443b0 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/dev/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/dev/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -11,7 +11,7 @@ #include -#include +#include namespace alpaka { @@ -58,6 +58,13 @@ namespace alpaka typename TSfinae = void> struct GetFreeMemBytes; + //############################################################################# + //! The device warp size get trait. + template< + typename T, + typename TSfinae = void> + struct GetWarpSize; + //############################################################################# //! The device reset trait. template< @@ -72,6 +79,10 @@ namespace alpaka typename T> using Dev = typename traits::DevType::type; + struct ConceptGetDev; + + struct ConceptDev; + //----------------------------------------------------------------------------- //! \return The device this object is bound to. 
template< @@ -79,9 +90,10 @@ namespace alpaka ALPAKA_FN_HOST auto getDev( T const & t) { + using ImplementationBase = concepts::ImplementationBase; return traits::GetDev< - T> + ImplementationBase> ::getDev( t); } @@ -131,6 +143,21 @@ namespace alpaka dev); } + //----------------------------------------------------------------------------- + //! \return The warp size on the device in number of threads. + template< + typename TDev> + ALPAKA_FN_HOST auto getWarpSize( + TDev const & dev) + -> std::size_t + { + return + traits::GetWarpSize< + TDev> + ::getWarpSize( + dev); + } + //----------------------------------------------------------------------------- //! Resets the device. //! What this method does is dependent on the accelerator. @@ -145,5 +172,20 @@ namespace alpaka ::reset( dev); } + + namespace traits + { + //############################################################################# + //! Get device type + template< + typename TDev> + struct DevType< + TDev, + typename std::enable_if::value>::type + > + { + using type = typename concepts::ImplementationBase; + }; + } } } diff --git a/thirdParty/cupla/alpaka/include/alpaka/dev/cpu/SysInfo.hpp b/thirdParty/cupla/alpaka/include/alpaka/dev/cpu/SysInfo.hpp index a9ed801343..4ebe932c59 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/dev/cpu/SysInfo.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/dev/cpu/SysInfo.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Daniel Vollmer, Erik Zenker, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -56,7 +56,7 @@ namespace alpaka __cpuid_count(level, subfunction, ex[0], ex[1], ex[2], ex[3]); } - #elif BOOST_COMP_MSVC || defined(__INTEL_COMPILER) + #elif BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED) || defined(__INTEL_COMPILER) #include //----------------------------------------------------------------------------- inline auto cpuid(std::uint32_t const level, std::uint32_t const subfunction, std::uint32_t ex[4]) diff --git a/thirdParty/cupla/alpaka/include/alpaka/dev/cpu/Wait.hpp b/thirdParty/cupla/alpaka/include/alpaka/dev/cpu/Wait.hpp index 4d95fc30c1..f0e70dac90 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/dev/cpu/Wait.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/dev/cpu/Wait.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Rene Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -20,29 +20,6 @@ namespace alpaka { namespace traits { - namespace detail - { - template - ALPAKA_FN_HOST auto currentThreadWaitForDevice( - TDevice const & dev, TQueueVector & vQueues - ) - ->void - { - // Furthermore there should not even be a chance to enqueue something between getting the queues and adding our wait events! - std::vector vEvents; - for(auto && spQueue : vQueues) - { - vEvents.emplace_back(dev); - spQueue->enqueue(vEvents.back()); - } - - // Now wait for all the events. - for(auto && event : vEvents) - { - wait::wait(event); - } - } - } //############################################################################# //! The CPU device thread wait specialization. //! @@ -59,12 +36,7 @@ namespace alpaka { ALPAKA_DEBUG_FULL_LOG_SCOPE; - // Get all the queues on the device at the time of invocation. - // All queues added afterwards are ignored. 
- auto vspQueues( - dev.m_spDevCpuImpl->GetAllQueues()); - - detail::currentThreadWaitForDevice(dev, vspQueues); + generic::currentThreadWaitForDevice(dev); } }; } diff --git a/thirdParty/cupla/alpaka/include/alpaka/dim/DimArithmetic.hpp b/thirdParty/cupla/alpaka/include/alpaka/dim/DimArithmetic.hpp index d62798e70f..0870f090a4 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/dim/DimArithmetic.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/dim/DimArithmetic.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/dim/DimIntegralConst.hpp b/thirdParty/cupla/alpaka/include/alpaka/dim/DimIntegralConst.hpp index d3d259eeca..35c7478bf9 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/dim/DimIntegralConst.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/dim/DimIntegralConst.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/dim/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/dim/Traits.hpp index 39736a61ec..5cc544889a 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/dim/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/dim/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/elem/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/elem/Traits.hpp index 62b19ecd3d..56d18b1100 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/elem/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/elem/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/event/EventCpu.hpp b/thirdParty/cupla/alpaka/include/alpaka/event/EventCpu.hpp index 9df9c1400a..0303e9f719 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/event/EventCpu.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/event/EventCpu.hpp @@ -1,6 +1,6 @@ -/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera +/* Copyright 2020 Jeffrey Kelling * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -9,487 +9,13 @@ #pragma once -#include -#include +#include #include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL - #include -#endif namespace alpaka { namespace event { - namespace cpu - { - namespace detail - { - //############################################################################# - //! The CPU device event implementation. 
- class EventCpuImpl final : public concepts::Implements - { - public: - //----------------------------------------------------------------------------- - EventCpuImpl( - dev::DevCpu const & dev) noexcept : - m_dev(dev), - m_mutex(), - m_enqueueCount(0u), - m_LastReadyEnqueueCount(0u) - {} - //----------------------------------------------------------------------------- - EventCpuImpl(EventCpuImpl const &) = delete; - //----------------------------------------------------------------------------- - EventCpuImpl(EventCpuImpl &&) = delete; - //----------------------------------------------------------------------------- - auto operator=(EventCpuImpl const &) -> EventCpuImpl & = delete; - //----------------------------------------------------------------------------- - auto operator=(EventCpuImpl &&) -> EventCpuImpl & = delete; - //----------------------------------------------------------------------------- - ~EventCpuImpl() noexcept = default; - - //----------------------------------------------------------------------------- - auto isReady() noexcept -> bool - { - return (m_LastReadyEnqueueCount == m_enqueueCount); - } - - //----------------------------------------------------------------------------- - auto wait(std::size_t const & enqueueCount, std::unique_lock& lk) const noexcept -> void - { - ALPAKA_ASSERT(enqueueCount <= m_enqueueCount); - - while(enqueueCount > m_LastReadyEnqueueCount) - { - auto future = m_future; - lk.unlock(); - future.get(); - lk.lock(); - } - } - - public: - dev::DevCpu const m_dev; //!< The device this event is bound to. - - std::mutex mutable m_mutex; //!< The mutex used to synchronize access to the event. - std::shared_future m_future; //!< The future signaling the event completion. - std::size_t m_enqueueCount; //!< The number of times this event has been enqueued. - std::size_t m_LastReadyEnqueueCount; //!< The time this event has been ready the last time. 
- //!< Ready means that the event was not waiting within a queue (not enqueued or already completed). - //!< If m_enqueueCount == m_LastReadyEnqueueCount, the event is currently not enqueued - }; - } - } - - //############################################################################# - //! The CPU device event. - class EventCpu final : public concepts::Implements - { - public: - //----------------------------------------------------------------------------- - //! \param bBusyWaiting Unused. EventCpu never does busy waiting. - EventCpu( - dev::DevCpu const & dev, - bool bBusyWaiting = true) : - m_spEventImpl(std::make_shared(dev)) - { - alpaka::ignore_unused(bBusyWaiting); - } - //----------------------------------------------------------------------------- - EventCpu(EventCpu const &) = default; - //----------------------------------------------------------------------------- - EventCpu(EventCpu &&) = default; - //----------------------------------------------------------------------------- - auto operator=(EventCpu const &) -> EventCpu & = default; - //----------------------------------------------------------------------------- - auto operator=(EventCpu &&) -> EventCpu & = default; - //----------------------------------------------------------------------------- - auto operator==(EventCpu const & rhs) const - -> bool - { - return (m_spEventImpl == rhs.m_spEventImpl); - } - //----------------------------------------------------------------------------- - auto operator!=(EventCpu const & rhs) const - -> bool - { - return !((*this) == rhs); - } - //----------------------------------------------------------------------------- - ~EventCpu() = default; - - public: - std::shared_ptr m_spEventImpl; - }; - } - - namespace dev - { - namespace traits - { - //############################################################################# - //! The CPU device event device get trait specialization. 
- template<> - struct GetDev< - event::EventCpu> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto getDev( - event::EventCpu const & event) - -> dev::DevCpu - { - return event.m_spEventImpl->m_dev; - } - }; - } - } - namespace event - { - namespace traits - { - //############################################################################# - //! The CPU device event test trait specialization. - template<> - struct Test< - event::EventCpu> - { - //----------------------------------------------------------------------------- - //! \return If the event is not waiting within a queue (not enqueued or already handled). - ALPAKA_FN_HOST static auto test( - event::EventCpu const & event) - -> bool - { - std::lock_guard lk(event.m_spEventImpl->m_mutex); - - return event.m_spEventImpl->isReady(); - } - }; - } - } - namespace queue - { - namespace traits - { - //############################################################################# - //! The CPU non-blocking device queue enqueue trait specialization. - template<> - struct Enqueue< - queue::cpu::detail::QueueCpuNonBlockingImpl, - event::EventCpu> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto enqueue( -#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX) - queue::cpu::detail::QueueCpuNonBlockingImpl & queueImpl, -#else - queue::cpu::detail::QueueCpuNonBlockingImpl &, -#endif - event::EventCpu & event) - -> void - { - ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - - // Copy the shared pointer of the event implementation. - // This is forwarded to the lambda that is enqueued into the queue to ensure that the event implementation is alive as long as it is enqueued. - auto spEventImpl(event.m_spEventImpl); - - // Setting the event state and enqueuing it has to be atomic. 
- std::lock_guard lk(spEventImpl->m_mutex); - - ++spEventImpl->m_enqueueCount; - -// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp. -#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX) - auto const enqueueCount = spEventImpl->m_enqueueCount; - - // Enqueue a task that only resets the events flag if it is completed. - spEventImpl->m_future = queueImpl.m_workerThread.enqueueTask( - [spEventImpl, enqueueCount]() - { - std::unique_lock lk2(spEventImpl->m_mutex); - - // Nothing to do if it has been re-enqueued to a later position in the queue. - if(enqueueCount == spEventImpl->m_enqueueCount) - { - spEventImpl->m_LastReadyEnqueueCount = spEventImpl->m_enqueueCount; - } - }); -#endif - } - }; - //############################################################################# - //! The CPU non-blocking device queue enqueue trait specialization. - template<> - struct Enqueue< - queue::QueueCpuNonBlocking, - event::EventCpu> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto enqueue( - queue::QueueCpuNonBlocking & queue, - event::EventCpu & event) - -> void - { - ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - - queue::enqueue(*queue.m_spQueueImpl, event); - } - }; - //############################################################################# - //! The CPU blocking device queue enqueue trait specialization. 
- template<> - struct Enqueue< - queue::cpu::detail::QueueCpuBlockingImpl, - event::EventCpu> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto enqueue( - queue::cpu::detail::QueueCpuBlockingImpl & queueImpl, - event::EventCpu & event) - -> void - { - ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - - std::promise promise; - { - std::lock_guard lk(queueImpl.m_mutex); - - queueImpl.m_bCurrentlyExecutingTask = true; - - auto & eventImpl(*event.m_spEventImpl); - - { - // Setting the event state and enqueuing it has to be atomic. - std::lock_guard evLk(eventImpl.m_mutex); - - ++eventImpl.m_enqueueCount; - // NOTE: Difference to non-blocking version: directly set the event state instead of enqueuing. - eventImpl.m_LastReadyEnqueueCount = eventImpl.m_enqueueCount; - - eventImpl.m_future = promise.get_future(); - } - - queueImpl.m_bCurrentlyExecutingTask = false; - } - promise.set_value(); - } - }; - //############################################################################# - //! The CPU blocking device queue enqueue trait specialization. - template<> - struct Enqueue< - queue::QueueCpuBlocking, - event::EventCpu> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto enqueue( - queue::QueueCpuBlocking & queue, - event::EventCpu & event) - -> void - { - ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - - queue::enqueue(*queue.m_spQueueImpl, event); - } - }; - } - } - namespace wait - { - namespace traits - { - //############################################################################# - //! The CPU device event thread wait trait specialization. - //! - //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been completed. - //! If the event is not enqueued to a queue the method returns immediately. 
- template<> - struct CurrentThreadWaitFor< - event::EventCpu> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto currentThreadWaitFor( - event::EventCpu const & event) - -> void - { - wait::wait(*event.m_spEventImpl); - } - }; - //############################################################################# - //! The CPU device event implementation thread wait trait specialization. - //! - //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been completed. - //! If the event is not enqueued to a queue the method returns immediately. - //! - //! NOTE: This method is for internal usage only. - template<> - struct CurrentThreadWaitFor< - event::cpu::detail::EventCpuImpl> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto currentThreadWaitFor( - event::cpu::detail::EventCpuImpl const & eventImpl) - -> void - { - std::unique_lock lk(eventImpl.m_mutex); - - auto const enqueueCount = eventImpl.m_enqueueCount; - eventImpl.wait(enqueueCount, lk); - } - }; - //############################################################################# - //! The CPU non-blocking device queue event wait trait specialization. - template<> - struct WaiterWaitFor< - queue::cpu::detail::QueueCpuNonBlockingImpl, - event::EventCpu> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto waiterWaitFor( -#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX) - queue::cpu::detail::QueueCpuNonBlockingImpl & queueImpl, -#else - queue::cpu::detail::QueueCpuNonBlockingImpl &, -#endif - event::EventCpu const & event) - -> void - { - // Copy the shared pointer of the event implementation. - // This is forwarded to the lambda that is enqueued into the queue to ensure that the event implementation is alive as long as it is enqueued. 
- auto spEventImpl(event.m_spEventImpl); - - std::lock_guard lk(spEventImpl->m_mutex); - - if(!spEventImpl->isReady()) - { -// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp. -#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX) - auto const enqueueCount = spEventImpl->m_enqueueCount; - - // Enqueue a task that waits for the given event. - queueImpl.m_workerThread.enqueueTask( - [spEventImpl, enqueueCount]() - { - std::unique_lock lk2(spEventImpl->m_mutex); - spEventImpl->wait(enqueueCount, lk2); - }); -#endif - } - } - }; - //############################################################################# - //! The CPU non-blocking device queue event wait trait specialization. - template<> - struct WaiterWaitFor< - queue::QueueCpuNonBlocking, - event::EventCpu> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto waiterWaitFor( - queue::QueueCpuNonBlocking & queue, - event::EventCpu const & event) - -> void - { - wait::wait(*queue.m_spQueueImpl, event); - } - }; - //############################################################################# - //! The CPU blocking device queue event wait trait specialization. - template<> - struct WaiterWaitFor< - queue::cpu::detail::QueueCpuBlockingImpl, - event::EventCpu> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto waiterWaitFor( - queue::cpu::detail::QueueCpuBlockingImpl & queueImpl, - event::EventCpu const & event) - -> void - { - alpaka::ignore_unused(queueImpl); - - // NOTE: Difference to non-blocking version: directly wait for event. - wait::wait(*event.m_spEventImpl); - } - }; - //############################################################################# - //! The CPU blocking device queue event wait trait specialization. 
- template<> - struct WaiterWaitFor< - queue::QueueCpuBlocking, - event::EventCpu> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto waiterWaitFor( - queue::QueueCpuBlocking & queue, - event::EventCpu const & event) - -> void - { - wait::wait(*queue.m_spQueueImpl, event); - } - }; - //############################################################################# - //! The CPU non-blocking device event wait trait specialization. - //! - //! Any future work submitted in any queue of this device will wait for event to complete before beginning execution. - template<> - struct WaiterWaitFor< - dev::DevCpu, - event::EventCpu> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto waiterWaitFor( - dev::DevCpu & dev, - event::EventCpu const & event) - -> void - { - // Get all the queues on the device at the time of invocation. - // All queues added afterwards are ignored. - auto vspQueues( - dev.m_spDevCpuImpl->GetAllQueues()); - - // Let all the queues wait for this event. - // Furthermore there should not even be a chance to enqueue something between getting the queues and adding our wait events! - for(auto && spQueue : vspQueues) - { - spQueue->wait(event); - } - } - }; - - //############################################################################# - //! The CPU non-blocking device queue thread wait trait specialization. - //! - //! Blocks execution of the calling thread until the queue has finished processing all previously requested tasks (kernels, data copies, ...) 
- template<> - struct CurrentThreadWaitFor< - queue::QueueCpuNonBlocking> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto currentThreadWaitFor( - queue::QueueCpuNonBlocking const & queue) - -> void - { - event::EventCpu event( - dev::getDev(queue)); - queue::enqueue( - const_cast(queue), - event); - wait::wait( - event); - } - }; - } + using EventCpu = EventGenericThreads; } } diff --git a/thirdParty/cupla/alpaka/include/alpaka/event/EventGenericThreads.hpp b/thirdParty/cupla/alpaka/include/alpaka/event/EventGenericThreads.hpp new file mode 100644 index 0000000000..3e952d3347 --- /dev/null +++ b/thirdParty/cupla/alpaka/include/alpaka/event/EventGenericThreads.hpp @@ -0,0 +1,537 @@ +/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL + #include +#endif + +namespace alpaka +{ + namespace event + { + namespace generic + { + namespace detail + { + //############################################################################# + //! The CPU device event implementation. 
+ template< + typename TDev> + class EventGenericThreadsImpl final : public concepts::Implements> + { + public: + //----------------------------------------------------------------------------- + EventGenericThreadsImpl( + TDev const & dev) noexcept : + m_dev(dev), + m_mutex(), + m_enqueueCount(0u), + m_LastReadyEnqueueCount(0u) + {} + //----------------------------------------------------------------------------- + EventGenericThreadsImpl(EventGenericThreadsImpl const &) = delete; + //----------------------------------------------------------------------------- + EventGenericThreadsImpl(EventGenericThreadsImpl &&) = delete; + //----------------------------------------------------------------------------- + auto operator=(EventGenericThreadsImpl const &) -> EventGenericThreadsImpl & = delete; + //----------------------------------------------------------------------------- + auto operator=(EventGenericThreadsImpl &&) -> EventGenericThreadsImpl & = delete; + //----------------------------------------------------------------------------- + ~EventGenericThreadsImpl() noexcept = default; + + //----------------------------------------------------------------------------- + auto isReady() noexcept -> bool + { + return (m_LastReadyEnqueueCount == m_enqueueCount); + } + + //----------------------------------------------------------------------------- + auto wait(std::size_t const & enqueueCount, std::unique_lock& lk) const noexcept -> void + { + ALPAKA_ASSERT(enqueueCount <= m_enqueueCount); + + while(enqueueCount > m_LastReadyEnqueueCount) + { + auto future = m_future; + lk.unlock(); + future.get(); + lk.lock(); + } + } + + public: + TDev const m_dev; //!< The device this event is bound to. + + std::mutex mutable m_mutex; //!< The mutex used to synchronize access to the event. + std::shared_future m_future; //!< The future signaling the event completion. + std::size_t m_enqueueCount; //!< The number of times this event has been enqueued. 
+ std::size_t m_LastReadyEnqueueCount; //!< The time this event has been ready the last time. + //!< Ready means that the event was not waiting within a queue (not enqueued or already completed). + //!< If m_enqueueCount == m_LastReadyEnqueueCount, the event is currently not enqueued + }; + } + } + + //############################################################################# + //! The CPU device event. + template< + typename TDev> + class EventGenericThreads final + : public concepts::Implements> + , public concepts::Implements> + { + public: + //----------------------------------------------------------------------------- + //! \param bBusyWaiting Unused. EventGenericThreads never does busy waiting. + EventGenericThreads( + TDev const & dev, + bool bBusyWaiting = true) : + m_spEventImpl(std::make_shared>(dev)) + { + alpaka::ignore_unused(bBusyWaiting); + } + //----------------------------------------------------------------------------- + EventGenericThreads(EventGenericThreads const &) = default; + //----------------------------------------------------------------------------- + EventGenericThreads(EventGenericThreads &&) = default; + //----------------------------------------------------------------------------- + auto operator=(EventGenericThreads const &) -> EventGenericThreads & = default; + //----------------------------------------------------------------------------- + auto operator=(EventGenericThreads &&) -> EventGenericThreads & = default; + //----------------------------------------------------------------------------- + auto operator==(EventGenericThreads const & rhs) const + -> bool + { + return (m_spEventImpl == rhs.m_spEventImpl); + } + //----------------------------------------------------------------------------- + auto operator!=(EventGenericThreads const & rhs) const + -> bool + { + return !((*this) == rhs); + } + //----------------------------------------------------------------------------- + ~EventGenericThreads() = default; + + public: 
+ std::shared_ptr> m_spEventImpl; + }; + } + + namespace dev + { + namespace traits + { + //############################################################################# + //! The CPU device event device get trait specialization. + template + struct GetDev< + event::EventGenericThreads> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto getDev( + event::EventGenericThreads const & event) + -> TDev + { + return event.m_spEventImpl->m_dev; + } + }; + } + } + namespace event + { + namespace traits + { + //############################################################################# + //! The CPU device event test trait specialization. + template + struct Test< + event::EventGenericThreads> + { + //----------------------------------------------------------------------------- + //! \return If the event is not waiting within a queue (not enqueued or already handled). + ALPAKA_FN_HOST static auto test( + event::EventGenericThreads const & event) + -> bool + { + std::lock_guard lk(event.m_spEventImpl->m_mutex); + + return event.m_spEventImpl->isReady(); + } + }; + } + } + namespace queue + { + namespace traits + { + //############################################################################# + //! The CPU non-blocking device queue enqueue trait specialization. + template< + typename TDev> + struct Enqueue< + queue::generic::detail::QueueGenericThreadsNonBlockingImpl, + event::EventGenericThreads> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto enqueue( + queue::generic::detail::QueueGenericThreadsNonBlockingImpl & queueImpl, + event::EventGenericThreads & event) + -> void + { +#if (BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX) + alpaka::ignore_unused(queueImpl); +#endif + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + // Copy the shared pointer of the event implementation. 
+ // This is forwarded to the lambda that is enqueued into the queue to ensure that the event implementation is alive as long as it is enqueued. + auto spEventImpl(event.m_spEventImpl); + + // Setting the event state and enqueuing it has to be atomic. + std::lock_guard lk(spEventImpl->m_mutex); + + ++spEventImpl->m_enqueueCount; + +// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp. +#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX) + auto const enqueueCount = spEventImpl->m_enqueueCount; + + // Enqueue a task that only resets the events flag if it is completed. + spEventImpl->m_future = queueImpl.m_workerThread.enqueueTask( + [spEventImpl, enqueueCount]() + { + std::unique_lock lk2(spEventImpl->m_mutex); + + // Nothing to do if it has been re-enqueued to a later position in the queue. + if(enqueueCount == spEventImpl->m_enqueueCount) + { + spEventImpl->m_LastReadyEnqueueCount = spEventImpl->m_enqueueCount; + } + }); +#endif + } + }; + //############################################################################# + //! The CPU non-blocking device queue enqueue trait specialization. + template< + typename TDev> + struct Enqueue< + queue::QueueGenericThreadsNonBlocking, + event::EventGenericThreads> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto enqueue( + queue::QueueGenericThreadsNonBlocking & queue, + event::EventGenericThreads & event) + -> void + { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + queue::enqueue(*queue.m_spQueueImpl, event); + } + }; + //############################################################################# + //! The CPU blocking device queue enqueue trait specialization. 
+ template< + typename TDev> + struct Enqueue< + queue::generic::detail::QueueGenericThreadsBlockingImpl, + event::EventGenericThreads> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto enqueue( + queue::generic::detail::QueueGenericThreadsBlockingImpl & queueImpl, + event::EventGenericThreads & event) + -> void + { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + std::promise promise; + { + std::lock_guard lk(queueImpl.m_mutex); + + queueImpl.m_bCurrentlyExecutingTask = true; + + auto & eventImpl(*event.m_spEventImpl); + + { + // Setting the event state and enqueuing it has to be atomic. + std::lock_guard evLk(eventImpl.m_mutex); + + ++eventImpl.m_enqueueCount; + // NOTE: Difference to non-blocking version: directly set the event state instead of enqueuing. + eventImpl.m_LastReadyEnqueueCount = eventImpl.m_enqueueCount; + + eventImpl.m_future = promise.get_future(); + } + + queueImpl.m_bCurrentlyExecutingTask = false; + } + promise.set_value(); + } + }; + //############################################################################# + //! The CPU blocking device queue enqueue trait specialization. + template< + typename TDev> + struct Enqueue< + queue::QueueGenericThreadsBlocking, + event::EventGenericThreads> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto enqueue( + queue::QueueGenericThreadsBlocking & queue, + event::EventGenericThreads & event) + -> void + { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + queue::enqueue(*queue.m_spQueueImpl, event); + } + }; + } + } + namespace wait + { + namespace traits + { + namespace generic + { + template + ALPAKA_FN_HOST auto currentThreadWaitForDevice( + TDev const & dev + ) + ->void + { + // Get all the queues on the device at the time of invocation. + // All queues added afterwards are ignored. 
+ auto vQueues(dev.getAllQueues()); + // Furthermore there should not even be a chance to enqueue something between getting the queues and adding our wait events! + std::vector> vEvents; + for(auto && spQueue : vQueues) + { + vEvents.emplace_back(dev); + spQueue->enqueue(vEvents.back()); + } + + // Now wait for all the events. + for(auto && event : vEvents) + { + wait::wait(event); + } + } + } + + //############################################################################# + //! The CPU device event thread wait trait specialization. + //! + //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been completed. + //! If the event is not enqueued to a queue the method returns immediately. + template + struct CurrentThreadWaitFor< + event::EventGenericThreads> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto currentThreadWaitFor( + event::EventGenericThreads const & event) + -> void + { + wait::wait(*event.m_spEventImpl); + } + }; + //############################################################################# + //! The CPU device event implementation thread wait trait specialization. + //! + //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been completed. + //! If the event is not enqueued to a queue the method returns immediately. + //! + //! NOTE: This method is for internal usage only. 
+ template + struct CurrentThreadWaitFor< + event::generic::detail::EventGenericThreadsImpl> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto currentThreadWaitFor( + event::generic::detail::EventGenericThreadsImpl const & eventImpl) + -> void + { + std::unique_lock lk(eventImpl.m_mutex); + + auto const enqueueCount = eventImpl.m_enqueueCount; + eventImpl.wait(enqueueCount, lk); + } + }; + //############################################################################# + //! The CPU non-blocking device queue event wait trait specialization. + template< + typename TDev> + struct WaiterWaitFor< + queue::generic::detail::QueueGenericThreadsNonBlockingImpl, + event::EventGenericThreads> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto waiterWaitFor( +#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX) + queue::generic::detail::QueueGenericThreadsNonBlockingImpl & queueImpl, +#else + queue::generic::detail::QueueGenericThreadsNonBlockingImpl &, +#endif + event::EventGenericThreads const & event) + -> void + { + // Copy the shared pointer of the event implementation. + // This is forwarded to the lambda that is enqueued into the queue to ensure that the event implementation is alive as long as it is enqueued. + auto spEventImpl(event.m_spEventImpl); + + std::lock_guard lk(spEventImpl->m_mutex); + + if(!spEventImpl->isReady()) + { +// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp. +#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX) + auto const enqueueCount = spEventImpl->m_enqueueCount; + + // Enqueue a task that waits for the given event. 
+ queueImpl.m_workerThread.enqueueTask( + [spEventImpl, enqueueCount]() + { + std::unique_lock lk2(spEventImpl->m_mutex); + spEventImpl->wait(enqueueCount, lk2); + }); +#endif + } + } + }; + //############################################################################# + //! The CPU non-blocking device queue event wait trait specialization. + template< + typename TDev> + struct WaiterWaitFor< + queue::QueueGenericThreadsNonBlocking, + event::EventGenericThreads> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto waiterWaitFor( + queue::QueueGenericThreadsNonBlocking & queue, + event::EventGenericThreads const & event) + -> void + { + wait::wait(*queue.m_spQueueImpl, event); + } + }; + //############################################################################# + //! The CPU blocking device queue event wait trait specialization. + template< + typename TDev> + struct WaiterWaitFor< + queue::generic::detail::QueueGenericThreadsBlockingImpl, + event::EventGenericThreads> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto waiterWaitFor( + queue::generic::detail::QueueGenericThreadsBlockingImpl & queueImpl, + event::EventGenericThreads const & event) + -> void + { + alpaka::ignore_unused(queueImpl); + + // NOTE: Difference to non-blocking version: directly wait for event. + wait::wait(*event.m_spEventImpl); + } + }; + //############################################################################# + //! The CPU blocking device queue event wait trait specialization. 
+ template< + typename TDev> + struct WaiterWaitFor< + queue::QueueGenericThreadsBlocking, + event::EventGenericThreads> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto waiterWaitFor( + queue::QueueGenericThreadsBlocking & queue, + event::EventGenericThreads const & event) + -> void + { + wait::wait(*queue.m_spQueueImpl, event); + } + }; + //############################################################################# + //! The CPU non-blocking device event wait trait specialization. + //! + //! Any future work submitted in any queue of this device will wait for event to complete before beginning execution. + template + struct WaiterWaitFor< + TDev, + event::EventGenericThreads> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto waiterWaitFor( + TDev & dev, + event::EventGenericThreads const & event) + -> void + { + // Get all the queues on the device at the time of invocation. + // All queues added afterwards are ignored. + auto vspQueues( + dev.getAllQueues()); + + // Let all the queues wait for this event. + // Furthermore there should not even be a chance to enqueue something between getting the queues and adding our wait events! + for(auto && spQueue : vspQueues) + { + spQueue->wait(event); + } + } + }; + + //############################################################################# + //! The CPU non-blocking device queue thread wait trait specialization. + //! + //! Blocks execution of the calling thread until the queue has finished processing all previously requested tasks (kernels, data copies, ...) 
+ template< + typename TDev> + struct CurrentThreadWaitFor< + queue::QueueGenericThreadsNonBlocking> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto currentThreadWaitFor( + queue::QueueGenericThreadsNonBlocking const & queue) + -> void + { + event::EventGenericThreads event( + dev::getDev(queue)); + queue::enqueue( + const_cast &>(queue), + event); + wait::wait( + event); + } + }; + } + } +} diff --git a/thirdParty/cupla/alpaka/include/alpaka/event/EventUniformCudaHipRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/event/EventUniformCudaHipRt.hpp index 336b77a9ed..2020aa8a09 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/event/EventUniformCudaHipRt.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/event/EventUniformCudaHipRt.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -112,7 +112,9 @@ namespace alpaka //############################################################################# //! The CUDA/HIP RT device event. - class EventUniformCudaHipRt final : public concepts::Implements + class EventUniformCudaHipRt final + : public concepts::Implements + , public concepts::Implements { public: //----------------------------------------------------------------------------- diff --git a/thirdParty/cupla/alpaka/include/alpaka/event/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/event/Traits.hpp index a828b4094c..b147d66ef3 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/event/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/event/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/example/ExampleDefaultAcc.hpp b/thirdParty/cupla/alpaka/include/alpaka/example/ExampleDefaultAcc.hpp new file mode 100644 index 0000000000..1441935458 --- /dev/null +++ b/thirdParty/cupla/alpaka/include/alpaka/example/ExampleDefaultAcc.hpp @@ -0,0 +1,53 @@ +/* Copyright 2020 Jeffrey Kelling + * + * This file exemplifies usage of alpaka. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR + * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include + +#pragma once + +namespace alpaka +{ + namespace example + { + //! Alias for the default accelerator used by examples. From a list of + //! all accelerators the first one which is enabled is chosen. + //! AccCpuSerial is selected last. 
+ template +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) + using ExampleDefaultAcc = alpaka::acc::AccGpuCudaRt; +#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED) + using ExampleDefaultAcc = alpaka::acc::AccGpuHipRt; +#elif defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED) + using ExampleDefaultAcc = alpaka::acc::AccCpuOmp2Blocks; +#elif defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) + using ExampleDefaultAcc = alpaka::acc::AccCpuTbbBlocks; +#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED) + using ExampleDefaultAcc = alpaka::acc::AccCpuFibers; +#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED) + using ExampleDefaultAcc = alpaka::acc::AccCpuOmp2Threads; +#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED) + using ExampleDefaultAcc = alpaka::acc::AccCpuThreads; +#elif defined(ALPAKA_ACC_CPU_BT_OMP4_ENABLED) + using ExampleDefaultAcc = alpaka::acc::AccCpuOmp4; +#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) + using ExampleDefaultAcc = alpaka::acc::AccCpuSerial; +#else + class ExampleDefaultAcc; + #warning "No supported backend selected." +#endif + } +} diff --git a/thirdParty/cupla/alpaka/include/alpaka/extent/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/extent/Traits.hpp index 98de7f9227..7895917a50 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/extent/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/extent/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/Accessors.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/Accessors.hpp index 9b8dfb3607..b13f0034dc 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/idx/Accessors.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/idx/Accessors.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -22,8 +22,6 @@ #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/MapIdx.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/MapIdx.hpp index aa85ec4409..ee354b0976 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/idx/MapIdx.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/idx/MapIdx.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/Traits.hpp index ece1c30f70..9a5a25dc0d 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/idx/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/idx/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtOmp.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtOmp.hpp index ba14dd111e..09e71fe71b 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtOmp.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtOmp.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtRefFiberIdMap.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtRefFiberIdMap.hpp index c61fbcc3c0..2c8b094d7c 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtRefFiberIdMap.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtRefFiberIdMap.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp index e43d83a672..32c5c0ab55 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp index 01145ce104..ac8fe43ad3 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtZero.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtZero.hpp index 7ae7c4fee5..fab302a831 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtZero.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtZero.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbRef.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbRef.hpp index 1bf7d9426f..d9644d638f 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbRef.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbRef.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp index 483f8b9a13..ccd8581505 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/intrinsic/IntrinsicCpu.hpp b/thirdParty/cupla/alpaka/include/alpaka/intrinsic/IntrinsicCpu.hpp new file mode 100644 index 0000000000..c180dccf64 --- /dev/null +++ b/thirdParty/cupla/alpaka/include/alpaka/intrinsic/IntrinsicCpu.hpp @@ -0,0 +1,151 @@ +/* Copyright 2020 Sergei Bastrakov + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include +#include + +#include + +#if BOOST_COMP_MSVC +#include +#endif + +namespace alpaka +{ + namespace intrinsic + { + //############################################################################# + //! The CPU intrinsic. 
+ class IntrinsicCpu : public concepts::Implements + { + public: + //----------------------------------------------------------------------------- + IntrinsicCpu() = default; + //----------------------------------------------------------------------------- + IntrinsicCpu(IntrinsicCpu const &) = delete; + //----------------------------------------------------------------------------- + IntrinsicCpu(IntrinsicCpu &&) = delete; + //----------------------------------------------------------------------------- + auto operator=(IntrinsicCpu const &) -> IntrinsicCpu & = delete; + //----------------------------------------------------------------------------- + auto operator=(IntrinsicCpu &&) -> IntrinsicCpu & = delete; + //----------------------------------------------------------------------------- + ~IntrinsicCpu() = default; + }; + + namespace traits + { + //############################################################################# + template<> + struct Popcount< + IntrinsicCpu> + { + //----------------------------------------------------------------------------- + static auto popcount( + intrinsic::IntrinsicCpu const & /*intrinsic*/, + std::uint32_t value) + -> std::int32_t + { +#if BOOST_COMP_GNUC || BOOST_COMP_CLANG || BOOST_COMP_INTEL + return __builtin_popcount(value); +#elif BOOST_COMP_MSVC + return __popcnt(value); +#else + // Fallback to standard library + return static_cast(std::bitset<32>(value).count()); +#endif + } + + //----------------------------------------------------------------------------- + static auto popcount( + intrinsic::IntrinsicCpu const & /*intrinsic*/, + std::uint64_t value) + -> std::int32_t + { +#if BOOST_COMP_GNUC || BOOST_COMP_CLANG || BOOST_COMP_INTEL + return __builtin_popcountll(value); +#elif BOOST_COMP_MSVC + return static_cast(__popcnt64(value)); +#else + // Fallback to standard library + return static_cast(std::bitset<64>(value).count()); +#endif + } + }; + + 
//############################################################################# + template<> + struct Ffs< + IntrinsicCpu> + { + //----------------------------------------------------------------------------- + static auto ffs( + intrinsic::IntrinsicCpu const & /*intrinsic*/, + std::int32_t value) + -> std::int32_t + { +#if BOOST_COMP_GNUC || BOOST_COMP_CLANG || BOOST_COMP_INTEL + return __builtin_ffs(value); +#elif BOOST_COMP_MSVC + // Implementation based on + // https://gitlab.freedesktop.org/cairo/cairo/commit/f5167dc2e1a13d8c4e5d66d7178a24b9b5e7ac7a + unsigned long index = 0u; + if (_BitScanForward(&index, value) != 0) + return static_cast(index + 1u); + else + return 0; +#else + return ffsFallback(value); +#endif + } + + //----------------------------------------------------------------------------- + static auto ffs( + intrinsic::IntrinsicCpu const & /*intrinsic*/, + std::int64_t value) + -> std::int32_t + { +#if BOOST_COMP_GNUC || BOOST_COMP_CLANG || BOOST_COMP_INTEL + return __builtin_ffsll(value); +#elif BOOST_COMP_MSVC + // Implementation based on + // https://gitlab.freedesktop.org/cairo/cairo/commit/f5167dc2e1a13d8c4e5d66d7178a24b9b5e7ac7a + unsigned long index = 0u; + if (_BitScanForward64(&index, value) != 0) + return static_cast(index + 1u); + else + return 0; +#else + return ffsFallback(value); +#endif + } + private: + + //----------------------------------------------------------------------------- + template< + typename TValue> + static auto ffsFallback(TValue value) + -> std::int32_t + { + if (value == 0) + return 0; + std::int32_t result = 1; + while ((value & 1) == 0) + { + value >>= 1; + result++; + } + return result; + } + }; + } + } +} diff --git a/thirdParty/cupla/alpaka/include/alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp new file mode 100644 index 0000000000..b857951787 --- /dev/null +++ 
b/thirdParty/cupla/alpaka/include/alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp @@ -0,0 +1,110 @@ +/* Copyright 2020 Sergei Bastrakov + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED) + +#include + +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA + #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA! +#endif + +#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP + #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP! +#endif + +#include + +namespace alpaka +{ + namespace intrinsic + { + //############################################################################# + //! The GPU CUDA/HIP intrinsic. + class IntrinsicUniformCudaHipBuiltIn : public concepts::Implements + { + public: + //----------------------------------------------------------------------------- + IntrinsicUniformCudaHipBuiltIn() = default; + //----------------------------------------------------------------------------- + __device__ IntrinsicUniformCudaHipBuiltIn(IntrinsicUniformCudaHipBuiltIn const &) = delete; + //----------------------------------------------------------------------------- + __device__ IntrinsicUniformCudaHipBuiltIn(IntrinsicUniformCudaHipBuiltIn &&) = delete; + //----------------------------------------------------------------------------- + __device__ auto operator=(IntrinsicUniformCudaHipBuiltIn const &) -> IntrinsicUniformCudaHipBuiltIn & = delete; + //----------------------------------------------------------------------------- + __device__ auto operator=(IntrinsicUniformCudaHipBuiltIn &&) -> IntrinsicUniformCudaHipBuiltIn & = delete; + 
//----------------------------------------------------------------------------- + ~IntrinsicUniformCudaHipBuiltIn() = default; + }; + + namespace traits + { + //############################################################################# + template<> + struct Popcount< + IntrinsicUniformCudaHipBuiltIn> + { + //----------------------------------------------------------------------------- + __device__ static auto popcount( + intrinsic::IntrinsicUniformCudaHipBuiltIn const & /*intrinsic*/, + std::uint32_t value) + -> std::int32_t + { +#if BOOST_COMP_CLANG && BOOST_LANG_CUDA + return __popc(static_cast(value)); +#else + return __popc(static_cast(value)); +#endif + } + + //----------------------------------------------------------------------------- + __device__ static auto popcount( + intrinsic::IntrinsicUniformCudaHipBuiltIn const & /*intrinsic*/, + std::uint64_t value) + -> std::int32_t + { +#if BOOST_COMP_CLANG && BOOST_LANG_CUDA + return __popcll(static_cast(value)); +#else + return __popcll(static_cast(value)); +#endif + } + }; + + //############################################################################# + template<> + struct Ffs< + IntrinsicUniformCudaHipBuiltIn> + { + //----------------------------------------------------------------------------- + __device__ static auto ffs( + intrinsic::IntrinsicUniformCudaHipBuiltIn const & /*intrinsic*/, + std::int32_t value) + -> std::int32_t + { + return __ffs(static_cast(value)); + } + + //----------------------------------------------------------------------------- + __device__ static auto ffs( + intrinsic::IntrinsicUniformCudaHipBuiltIn const & /*intrinsic*/, + std::int64_t value) + -> std::int32_t + { + return __ffsll(static_cast(value)); + } + }; + } + } +} + +#endif diff --git a/thirdParty/cupla/alpaka/include/alpaka/intrinsic/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/intrinsic/Traits.hpp new file mode 100644 index 0000000000..6c054797a8 --- /dev/null +++ 
b/thirdParty/cupla/alpaka/include/alpaka/intrinsic/Traits.hpp @@ -0,0 +1,135 @@ +/* Copyright 2020 Sergei Bastrakov + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace alpaka +{ + //----------------------------------------------------------------------------- + //! The intrinsic specifics + namespace intrinsic + { + struct ConceptIntrinsic{}; + + //----------------------------------------------------------------------------- + //! The intrinsics traits. + namespace traits + { + //############################################################################# + //! The popcount trait. + template< + typename TWarp, + typename TSfinae = void> + struct Popcount; + + //############################################################################# + //! The ffs trait. + template< + typename TWarp, + typename TSfinae = void> + struct Ffs; + } + + //----------------------------------------------------------------------------- + //! Returns the number of 1 bits in the given 32-bit value. + //! + //! \tparam TIntrinsic The intrinsic implementation type. + //! \param intrinsic The intrinsic implementation. + //! \param value The input value. + ALPAKA_NO_HOST_ACC_WARNING + template< + typename TIntrinsic> + ALPAKA_FN_ACC auto popcount( + TIntrinsic const & intrinsic, + std::uint32_t value) + -> std::int32_t + { + using ImplementationBase = concepts::ImplementationBase; + return traits::Popcount< + ImplementationBase> + ::popcount( + intrinsic, + value); + } + + //----------------------------------------------------------------------------- + //! Returns the number of 1 bits in the given 64-bit value. + //! + //! \tparam TIntrinsic The intrinsic implementation type. + //! 
\param intrinsic The intrinsic implementation. + //! \param value The input value. + ALPAKA_NO_HOST_ACC_WARNING + template< + typename TIntrinsic> + ALPAKA_FN_ACC auto popcount( + TIntrinsic const & intrinsic, + std::uint64_t value) + -> std::int32_t + { + using ImplementationBase = concepts::ImplementationBase; + return traits::Popcount< + ImplementationBase> + ::popcount( + intrinsic, + value); + } + + //----------------------------------------------------------------------------- + //! Returns the 1-based position of the least significant bit set to 1 + //! in the given 32-bit value. Returns 0 for input value 0. + //! + //! \tparam TIntrinsic The intrinsic implementation type. + //! \param intrinsic The intrinsic implementation. + //! \param value The input value. + ALPAKA_NO_HOST_ACC_WARNING + template< + typename TIntrinsic> + ALPAKA_FN_ACC auto ffs( + TIntrinsic const & intrinsic, + std::int32_t value) + -> std::int32_t + { + using ImplementationBase = concepts::ImplementationBase; + return traits::Ffs< + ImplementationBase> + ::ffs( + intrinsic, + value); + } + + //----------------------------------------------------------------------------- + //! Returns the 1-based position of the least significant bit set to 1 + //! in the given 64-bit value. Returns 0 for input value 0. + //! + //! \tparam TIntrinsic The intrinsic implementation type. + //! \param intrinsic The intrinsic implementation. + //! \param value The input value. 
+ ALPAKA_NO_HOST_ACC_WARNING + template< + typename TIntrinsic> + ALPAKA_FN_ACC auto ffs( + TIntrinsic const & intrinsic, + std::int64_t value) + -> std::int32_t + { + using ImplementationBase = concepts::ImplementationBase; + return traits::Ffs< + ImplementationBase> + ::ffs( + intrinsic, + value); + } + } +} diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuFibers.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuFibers.hpp index 627311a69f..f72acc1ba8 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuFibers.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuFibers.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -20,6 +20,7 @@ // Implementation details. #include +#include #include #include #include @@ -122,7 +123,7 @@ namespace alpaka // Get the size of the block shared dynamic memory. auto const blockSharedMemDynSizeBytes( meta::apply( - [&](std::decay_t const & ... args) + [&](ALPAKA_DECAY_T(TArgs) const & ... args) { return kernel::getBlockSharedMemDynSizeBytes< @@ -152,7 +153,7 @@ namespace alpaka auto const boundGridBlockExecHost( meta::apply( - [this, &acc, &blockThreadExtent, &fiberPool](std::decay_t const & ... args) + [this, &acc, &blockThreadExtent, &fiberPool](ALPAKA_DECAY_T(TArgs) const & ... args) { // Bind the kernel and its arguments to the grid block function. 
return diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp index d935ffe8de..d73bf1bd9f 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Bert Wesarg, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -24,6 +24,7 @@ // Implementation details. #include +#include #include #include #include @@ -100,7 +101,7 @@ namespace alpaka // Get the size of the block shared dynamic memory. auto const blockSharedMemDynSizeBytes( meta::apply( - [&](std::decay_t const & ... args) + [&](ALPAKA_DECAY_T(TArgs) const & ... args) { return kernel::getBlockSharedMemDynSizeBytes< @@ -120,7 +121,7 @@ namespace alpaka // TODO: With C++14 we could create a perfectly argument forwarding function object within the constructor. auto const boundKernelFnObj( meta::apply( - [this](std::decay_t const & ... args) + [this](ALPAKA_DECAY_T(TArgs) const & ... args) { return std::bind( diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp index 8e6c192786..8e06ead149 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -24,6 +24,7 @@ // Implementation details. #include +#include #include #include #include @@ -99,7 +100,7 @@ namespace alpaka // Get the size of the block shared dynamic memory. auto const blockSharedMemDynSizeBytes( meta::apply( - [&](std::decay_t const & ... args) + [&](ALPAKA_DECAY_T(TArgs) const & ... args) { return kernel::getBlockSharedMemDynSizeBytes< @@ -119,7 +120,7 @@ namespace alpaka // TODO: With C++14 we could create a perfectly argument forwarding function object within the constructor. auto const boundKernelFnObj( meta::apply( - [this](std::decay_t const & ... args) + [this](ALPAKA_DECAY_T(TArgs) const & ... args) { return std::bind( @@ -162,6 +163,8 @@ namespace alpaka // Therefore we use 'omp parallel' with the specified number of threads in a block. #pragma omp parallel num_threads(iBlockThreadCount) { + // The guard is for gcc internal compiler error, as discussed in #735 +#if (!BOOST_COMP_GNUC) || (BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(8, 1, 0)) #pragma omp single nowait { // The OpenMP runtime does not create a parallel region when only one thread is required in the num_threads clause. 
@@ -171,21 +174,13 @@ namespace alpaka throw std::runtime_error("The OpenMP 2.0 runtime did not create a parallel region!"); } - // GCC fails with: - // error: redeclaration of const int& iBlockThreadCount - // if(numThreads != iBlockThreadCount) - // ^ - // note: const int& iBlockThreadCount previously declared here - // #pragma omp parallel num_threads(iBlockThreadCount) - // ^ -#if (!BOOST_COMP_GNUC) || (BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(6, 0, 0)) int const numThreads(::omp_get_num_threads()); if(numThreads != iBlockThreadCount) { throw std::runtime_error("The OpenMP 2.0 runtime did not use the number of threads that had been required!"); } -#endif } +#endif boundKernelFnObj( acc); diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp4.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp4.hpp index a10a059800..dcbfb96388 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp4.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp4.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -24,6 +24,7 @@ // Implementation details. #include +#include #include #include #include @@ -99,7 +100,7 @@ namespace alpaka // Get the size of the block shared dynamic memory. auto const blockSharedMemDynSizeBytes( meta::apply( - [&](std::decay_t const & ... args) + [&](ALPAKA_DECAY_T(TArgs) const & ... args) { return kernel::getBlockSharedMemDynSizeBytes< @@ -119,7 +120,7 @@ namespace alpaka // TODO: With C++14 we could create a perfectly argument forwarding function object within the constructor. auto const boundKernelFnObj( meta::apply( - [this](std::decay_t const & ... args) + [this](ALPAKA_DECAY_T(TArgs) const & ... 
args) { return std::bind( diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuSerial.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuSerial.hpp index cfa96fb8f7..329d6248f9 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuSerial.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuSerial.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -20,6 +20,7 @@ // Implementation details. #include +#include #include #include #include @@ -92,7 +93,7 @@ namespace alpaka // Get the size of the block shared dynamic memory. auto const blockSharedMemDynSizeBytes( meta::apply( - [&](std::decay_t const & ... args) + [&](ALPAKA_DECAY_T(TArgs) const & ... args) { return kernel::getBlockSharedMemDynSizeBytes< @@ -112,7 +113,7 @@ namespace alpaka // TODO: With C++14 we could create a perfectly argument forwarding function object within the constructor. auto const boundKernelFnObj( meta::apply( - [this](std::decay_t const & ... args) + [this](ALPAKA_DECAY_T(TArgs) const & ... args) { return std::bind( diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp index cf8a87ee65..347605425b 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Erik Zenker, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -20,6 +20,7 @@ // Implementation details. 
#include +#include #include #include #include @@ -98,7 +99,7 @@ namespace alpaka // Get the size of the block shared dynamic memory. auto const blockSharedMemDynSizeBytes( meta::apply( - [&](std::decay_t const & ... args) + [&](ALPAKA_DECAY_T(TArgs) const & ... args) { return kernel::getBlockSharedMemDynSizeBytes< @@ -118,7 +119,7 @@ namespace alpaka // TODO: With C++14 we could create a perfectly argument forwarding function object within the constructor. auto const boundKernelFnObj( meta::apply( - [this](std::decay_t const & ... args) + [this](ALPAKA_DECAY_T(TArgs) const & ... args) { return std::bind( diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuThreads.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuThreads.hpp index 83ee9a9ff1..cd2915d8bb 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuThreads.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuThreads.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -20,6 +20,7 @@ // Implementation details. #include +#include #include #include #include @@ -121,7 +122,7 @@ namespace alpaka // Get the size of the block shared dynamic memory. auto const blockSharedMemDynSizeBytes( meta::apply( - [&](std::decay_t const & ... args) + [&](ALPAKA_DECAY_T(TArgs) const & ... args) { return kernel::getBlockSharedMemDynSizeBytes< @@ -147,7 +148,7 @@ namespace alpaka // Bind the kernel and its arguments to the grid block function. auto const boundGridBlockExecHost( meta::apply( - [this, &acc, &blockThreadExtent, &threadPool](std::decay_t const & ... args) + [this, &acc, &blockThreadExtent, &threadPool](ALPAKA_DECAY_T(TArgs) const & ... 
args) { return std::bind( diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp index b347c51239..a1c49658ad 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -38,6 +38,7 @@ // Implementation details. #include +#include #include #include #include @@ -342,7 +343,7 @@ namespace alpaka // Get the size of the block shared dynamic memory. auto const blockSharedMemDynSizeBytes( meta::apply( - [&](std::decay_t const & ... args) + [&](ALPAKA_DECAY_T(TArgs) const & ... args) { return kernel::getBlockSharedMemDynSizeBytes< @@ -388,7 +389,7 @@ namespace alpaka // This forces the type of a float argument given with std::forward to this function to be of type float instead of e.g. "float const & __ptr64" (MSVC). // If not given by value, the kernel launch code does not copy the value but the pointer to the value location. meta::apply( - [&](std::decay_t const & ... args) + [&](ALPAKA_DECAY_T(TArgs) const & ... args) { #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) kernelName<<< @@ -487,7 +488,7 @@ namespace alpaka // Get the size of the block shared dynamic memory. auto const blockSharedMemDynSizeBytes( meta::apply( - [&](std::decay_t const & ... args) + [&](ALPAKA_DECAY_T(TArgs) const & ... args) { return kernel::getBlockSharedMemDynSizeBytes< @@ -533,7 +534,7 @@ namespace alpaka // Enqueue the kernel execution. meta::apply( - [&](std::decay_t const & ... args) + [&](ALPAKA_DECAY_T(TArgs) const & ... 
args) { #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) kernelName<<< diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/Traits.hpp index 5b5c4377c0..76e2912e49 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/kernel/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/kernel/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -156,8 +156,13 @@ namespace alpaka TKernelFnObj const &, TArgs const & ...) { +#if defined(__cpp_lib_is_invocable) && __cpp_lib_is_invocable >= 201703 + using Result = std::invoke_result_t; +#else + using Result = std::result_of_t; +#endif static_assert( - std::is_same, void>::value, + std::is_same::value, "The TKernelFnObj is required to return void!"); } }; diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/MathStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/MathStdLib.hpp index eea098bcda..94690fef8f 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/MathStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/MathStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/MathUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/MathUniformCudaHipBuiltIn.hpp index ca0ebf3730..570b4fa391 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/MathUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/MathUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/abs/AbsStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/abs/AbsStdLib.hpp index 58f9c06fbe..0f5aa82f78 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/abs/AbsStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/abs/AbsStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/abs/AbsUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/abs/AbsUniformCudaHipBuiltIn.hpp index 7d8bd673d2..cd93b7b5bc 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/abs/AbsUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/abs/AbsUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/abs/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/abs/Traits.hpp index 775ffbc158..ad745812c7 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/abs/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/abs/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/acos/AcosStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/acos/AcosStdLib.hpp index d8bfb167a0..d25e7fec54 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/acos/AcosStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/acos/AcosStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/acos/AcosUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/acos/AcosUniformCudaHipBuiltIn.hpp index 7f128e6e4c..0129e1140d 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/acos/AcosUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/acos/AcosUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/acos/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/acos/Traits.hpp index 4b683c802d..1d9448776e 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/acos/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/acos/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/asin/AsinStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/asin/AsinStdLib.hpp index 7b89e1f64c..f8a615026c 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/asin/AsinStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/asin/AsinStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/asin/AsinUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/asin/AsinUniformCudaHipBuiltIn.hpp index 67c51dc438..cf001403af 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/asin/AsinUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/asin/AsinUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/asin/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/asin/Traits.hpp index 7f3392d0a7..f5c48df184 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/asin/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/asin/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/atan/AtanStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/atan/AtanStdLib.hpp index a1009a32f9..640b2ec4d8 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/atan/AtanStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/atan/AtanStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/atan/AtanUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/atan/AtanUniformCudaHipBuiltIn.hpp index 33d3255120..203f684887 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/atan/AtanUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/atan/AtanUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/atan/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/atan/Traits.hpp index 950141b24b..0b9209b7e5 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/atan/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/atan/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Atan2StdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Atan2StdLib.hpp index 24caeb147b..e7e4ccac6c 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Atan2StdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Atan2StdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Atan2UniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Atan2UniformCudaHipBuiltIn.hpp index 93dcc92857..a25c4c1ede 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Atan2UniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Atan2UniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Traits.hpp index fbe77ab36b..2102981740 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/CbrtStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/CbrtStdLib.hpp index 494ed4a1d2..9c83cf46ae 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/CbrtStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/CbrtStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/CbrtUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/CbrtUniformCudaHipBuiltIn.hpp index 9eec2ddc95..c95e77168c 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/CbrtUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/CbrtUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/Traits.hpp index abe7048000..3786906cdc 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/ceil/CeilStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/ceil/CeilStdLib.hpp index 73a3c96b0f..9e8e953fda 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/ceil/CeilStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/ceil/CeilStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/ceil/CeilUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/ceil/CeilUniformCudaHipBuiltIn.hpp index 4d69d16366..447023630c 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/ceil/CeilUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/ceil/CeilUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/ceil/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/ceil/Traits.hpp index d069fd8153..5341113a5f 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/ceil/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/ceil/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/cos/CosStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/cos/CosStdLib.hpp index 983e42ee79..e10bb7a827 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/cos/CosStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/cos/CosStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/cos/CosUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/cos/CosUniformCudaHipBuiltIn.hpp index 8309a04da3..9341d88c7b 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/cos/CosUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/cos/CosUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/cos/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/cos/Traits.hpp index 729898b450..4ed70c405a 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/cos/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/cos/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/erf/ErfStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/erf/ErfStdLib.hpp index fe9fb81328..b9cd4d735c 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/erf/ErfStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/erf/ErfStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/erf/ErfUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/erf/ErfUniformCudaHipBuiltIn.hpp index 47f06f15cc..0fbc4bcc34 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/erf/ErfUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/erf/ErfUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/erf/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/erf/Traits.hpp index d7658d1890..20e1a0a3e6 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/erf/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/erf/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/exp/ExpStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/exp/ExpStdLib.hpp index 5d1c5ffcd9..f7d4b021ce 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/exp/ExpStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/exp/ExpStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/exp/ExpUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/exp/ExpUniformCudaHipBuiltIn.hpp index 18b09d3b7e..6741e2ea17 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/exp/ExpUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/exp/ExpUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/exp/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/exp/Traits.hpp index edf6461fa0..3da4f7d013 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/exp/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/exp/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/floor/FloorStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/floor/FloorStdLib.hpp index 90fc152e62..332c806904 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/floor/FloorStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/floor/FloorStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/floor/FloorUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/floor/FloorUniformCudaHipBuiltIn.hpp index 6834fa20bb..d5bb276296 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/floor/FloorUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/floor/FloorUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/floor/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/floor/Traits.hpp index 38d31d0de5..4b978c9575 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/floor/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/floor/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/fmod/FmodStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/fmod/FmodStdLib.hpp index cae03fd334..d79f1ec871 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/fmod/FmodStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/fmod/FmodStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/fmod/FmodUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/fmod/FmodUniformCudaHipBuiltIn.hpp index 98fb337503..7996deae5b 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/fmod/FmodUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/fmod/FmodUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/fmod/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/fmod/Traits.hpp index e7c6ad4e6a..793e1ce77f 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/fmod/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/fmod/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/log/LogStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/log/LogStdLib.hpp index e28c371662..95edc03634 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/log/LogStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/log/LogStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/log/LogUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/log/LogUniformCudaHipBuiltIn.hpp index 6c697a2150..e87c48d7d1 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/log/LogUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/log/LogUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/log/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/log/Traits.hpp index b9d1f48205..e24e66991f 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/log/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/log/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/max/MaxStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/max/MaxStdLib.hpp index 100bf293f3..353bdeccb0 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/max/MaxStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/max/MaxStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/max/MaxUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/max/MaxUniformCudaHipBuiltIn.hpp index c82ae48b75..c96d46f7be 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/max/MaxUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/max/MaxUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -71,6 +71,7 @@ namespace alpaka MaxUniformCudaHipBuiltIn const & max_ctx, Tx const & x, Ty const & y) + -> decltype(::max(x, y)) { alpaka::ignore_unused(max_ctx); return ::max(x, y); @@ -95,6 +96,7 @@ namespace alpaka MaxUniformCudaHipBuiltIn const & max_ctx, Tx const & x, Ty const & y) + -> decltype(::fmax(x, y)) { alpaka::ignore_unused(max_ctx); return ::fmax(x, y); diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/max/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/max/Traits.hpp index 3f8ea16e40..ba074edf94 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/max/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/max/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/min/MinStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/min/MinStdLib.hpp index 40b3ee47b2..7a2963c2c2 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/min/MinStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/min/MinStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Alexander Matthes, Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/min/MinUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/min/MinUniformCudaHipBuiltIn.hpp index b5b68cfc4e..b7851c6ed5 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/min/MinUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/min/MinUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Alexander Matthes, Axel Huebl, Benjamin Worpitz, Bert Wesarg * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -72,6 +72,7 @@ namespace alpaka MinUniformCudaHipBuiltIn const & min_ctx, Tx const & x, Ty const & y) + -> decltype(::min(x, y)) { alpaka::ignore_unused(min_ctx); return ::min(x, y); @@ -96,6 +97,7 @@ namespace alpaka MinUniformCudaHipBuiltIn const & min_ctx, Tx const & x, Ty const & y) + -> decltype(::fmin(x, y)) { alpaka::ignore_unused(min_ctx); return ::fmin(x, y); diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/min/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/min/Traits.hpp index da2026552d..1e98a414c3 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/min/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/min/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/pow/PowStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/pow/PowStdLib.hpp index 1595a53337..a74d80390b 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/pow/PowStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/pow/PowStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/pow/PowUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/pow/PowUniformCudaHipBuiltIn.hpp index 8af408352c..aa896d5f1c 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/pow/PowUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/pow/PowUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/pow/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/pow/Traits.hpp index 9b135798c4..cf6cc9939a 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/pow/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/pow/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/remainder/RemainderStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/remainder/RemainderStdLib.hpp index da1403b068..652fb1bd26 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/remainder/RemainderStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/remainder/RemainderStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/remainder/RemainderUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/remainder/RemainderUniformCudaHipBuiltIn.hpp index 826405883e..fe8003792c 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/remainder/RemainderUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/remainder/RemainderUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/remainder/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/remainder/Traits.hpp index 449ac8e5ea..a5520ea580 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/remainder/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/remainder/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/round/RoundStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/round/RoundStdLib.hpp index 788eca0c3a..2606694721 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/round/RoundStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/round/RoundStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/round/RoundUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/round/RoundUniformCudaHipBuiltIn.hpp index b825f6240a..9eb9a24a98 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/round/RoundUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/round/RoundUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/round/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/round/Traits.hpp index d4ba64636e..2a82dbbf7e 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/round/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/round/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -11,9 +11,6 @@ #include #include -#include - -#include #include diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/RsqrtStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/RsqrtStdLib.hpp index a733eac1ab..5a890e99b2 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/RsqrtStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/RsqrtStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/RsqrtUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/RsqrtUniformCudaHipBuiltIn.hpp index 9c804b1414..92f2fa6fb5 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/RsqrtUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/RsqrtUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Valentin Gehrke * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/Traits.hpp index 064c684989..dc4a726d67 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sin/SinStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sin/SinStdLib.hpp index c69f019761..339f3970b1 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/sin/SinStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/sin/SinStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sin/SinUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sin/SinUniformCudaHipBuiltIn.hpp index 2c8c50f3b1..7409840332 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/sin/SinUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/sin/SinUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Valentin Gehrke * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sin/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sin/Traits.hpp index 55f251ae14..83d62d4e5a 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/sin/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/sin/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sincos/SinCosStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sincos/SinCosStdLib.hpp index a87d621673..fe65fbe73c 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/sincos/SinCosStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/sincos/SinCosStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sincos/SinCosUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sincos/SinCosUniformCudaHipBuiltIn.hpp index f4569c7980..f5bb59f33b 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/sincos/SinCosUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/sincos/SinCosUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sincos/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sincos/Traits.hpp index 7ec0e85aff..d6d6d108b8 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/sincos/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/sincos/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/SqrtStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/SqrtStdLib.hpp index 6e4602f145..d36fa7140d 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/SqrtStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/SqrtStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/SqrtUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/SqrtUniformCudaHipBuiltIn.hpp index cdb7271c53..52a5c39abb 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/SqrtUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/SqrtUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Valentin Gehrke * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/Traits.hpp index 8ea328d674..879fe4043d 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/tan/TanStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/tan/TanStdLib.hpp index 4e990c4f50..3405e70ca1 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/tan/TanStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/tan/TanStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/tan/TanUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/tan/TanUniformCudaHipBuiltIn.hpp index 521857a50a..b881c47a91 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/tan/TanUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/tan/TanUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/tan/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/tan/Traits.hpp index 9dd24489e2..20e6f92058 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/tan/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/tan/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/trunc/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/trunc/Traits.hpp index 2930885ffc..60fccf318c 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/trunc/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/trunc/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/trunc/TruncStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/trunc/TruncStdLib.hpp index c063bc0727..407ff13490 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/trunc/TruncStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/trunc/TruncStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/trunc/TruncUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/trunc/TruncUniformCudaHipBuiltIn.hpp index 1651499271..549f3e54be 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/math/trunc/TruncUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/math/trunc/TruncUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 
2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/AllocCpuBoostAligned.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/AllocCpuAligned.hpp similarity index 85% rename from thirdParty/cupla/alpaka/include/alpaka/mem/alloc/AllocCpuBoostAligned.hpp rename to thirdParty/cupla/alpaka/include/alpaka/mem/alloc/AllocCpuAligned.hpp index cb1e8ba0af..be4470d864 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/AllocCpuBoostAligned.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/AllocCpuAligned.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -11,11 +11,10 @@ #include +#include #include #include -#include - #include namespace alpaka @@ -32,7 +31,7 @@ namespace alpaka //! \tparam TAlignment An integral constant containing the alignment. 
template< typename TAlignment> - class AllocCpuBoostAligned : public concepts::Implements> + class AllocCpuAligned : public concepts::Implements> { }; @@ -45,11 +44,11 @@ namespace alpaka typename TAlignment> struct Alloc< T, - AllocCpuBoostAligned> + AllocCpuAligned> { //----------------------------------------------------------------------------- ALPAKA_FN_HOST static auto alloc( - AllocCpuBoostAligned const & alloc, + AllocCpuAligned const & alloc, std::size_t const & sizeElems) -> T * { @@ -70,7 +69,7 @@ namespace alpaka alpaka::ignore_unused(alloc); return reinterpret_cast( - boost::alignment::aligned_alloc(std::max(TAlignment::value, minAlignement), sizeElems * sizeof(T))); + core::alignedAlloc(std::max(TAlignment::value, minAlignement), sizeElems * sizeof(T))); } }; @@ -81,16 +80,16 @@ namespace alpaka typename TAlignment> struct Free< T, - AllocCpuBoostAligned> + AllocCpuAligned> { //----------------------------------------------------------------------------- ALPAKA_FN_HOST static auto free( - AllocCpuBoostAligned const & alloc, + AllocCpuAligned const & alloc, T const * const ptr) -> void { alpaka::ignore_unused(alloc); - boost::alignment::aligned_free( + core::alignedFree( const_cast( reinterpret_cast(ptr))); } diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/AllocCpuNew.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/AllocCpuNew.hpp index badd8f5608..aaf554ea2b 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/AllocCpuNew.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/AllocCpuNew.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/Traits.hpp index c2b058c517..66116935f8 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufCpu.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufCpu.hpp index a3560b9060..43c85d877a 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufCpu.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufCpu.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Alexander Matthes, Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -25,7 +25,7 @@ #include #endif -#include +#include #include @@ -49,7 +49,7 @@ namespace alpaka typename TDim, typename TIdx> class BufCpuImpl final : - public mem::alloc::AllocCpuBoostAligned> + public mem::alloc::AllocCpuAligned> { static_assert( !std::is_const::value, @@ -64,7 +64,7 @@ namespace alpaka ALPAKA_FN_HOST BufCpuImpl( dev::DevCpu const & dev, TExtent const & extent) : - mem::alloc::AllocCpuBoostAligned>(), + mem::alloc::AllocCpuAligned>(), m_dev(dev), m_extentElements(extent::getExtentVecEnd(extent)), m_pMem(mem::alloc::alloc(*this, static_cast(computeElementCount(extent)))), diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp index e716c08186..4113c93f9b 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Alexander Matthes, Benjamin Worpitz, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -401,7 +401,7 @@ namespace alpaka void * memPtr = nullptr; - std::size_t pitchBytes = widthBytes; + std::size_t pitchBytes = 0u; #ifdef ALPAKA_ACC_GPU_HIP_ENABLED //FIXME: HIP cannot handle zero-size input (throws Unknown Error) if(width!=0 && height!=0) @@ -471,6 +471,7 @@ namespace alpaka ALPAKA_API_PREFIX(PitchedPtr) pitchedPtrVal; pitchedPtrVal.ptr = nullptr; #ifdef ALPAKA_ACC_GPU_HIP_ENABLED + pitchedPtrVal.pitch = 0u; //FIXME: HIP cannot handle zero-size input if(extentVal.width!=0 && extentVal.height!=0 diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/Traits.hpp index 1fac7fd261..ec7d8ba9b3 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Alexander Matthes, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -13,8 +13,6 @@ #include -#include - namespace alpaka { //----------------------------------------------------------------------------- @@ -101,13 +99,15 @@ namespace alpaka typename TElem, typename TDim, typename TIdx> - using Buf = typename traits::BufType::type; + using Buf = typename traits::BufType< + alpaka::dev::Dev, TElem, TDim, TIdx>::type; //----------------------------------------------------------------------------- //! Allocates memory on the given device. //! //! \tparam TElem The element type of the returned buffer. - //! \tparam TExtent The extent of the buffer. + //! \tparam TIdx The linear index type of the buffer. + //! \tparam TExtent The extent type of the buffer. //! \tparam TDev The type of device the buffer is allocated on. //! \param dev The device to allocate the buffer on. //! 
\param extent The extent of the buffer. diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cpu/Copy.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cpu/Copy.hpp index 2d3d250a21..c3aa46e83f 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cpu/Copy.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cpu/Copy.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, Rene Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cpu/Set.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cpu/Set.hpp index 7f251f1458..74eea1fa7c 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cpu/Set.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cpu/Set.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp index 3a40b9a4dd..a30cb1e227 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp @@ -1,6 +1,7 @@ -/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker, Matthias Werner +/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker, Matthias Werner, + * Rene Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -41,7 +42,8 @@ #include #include - +#include +#include namespace alpaka { @@ -53,6 +55,186 @@ namespace alpaka { namespace detail { + using vec3D = alpaka::vec::Vec, size_t>; + using vec2D = alpaka::vec::Vec, size_t>; + + ///! copy 3D memory + /// + /// It is required to start `height * depth` HIP/CUDA blocks. + /// The kernel loops over the memory rows. + template + __global__ void hipMemcpy3DEmulatedKernelD2D( + char * dstPtr, vec2D const dstPitch, + char const * srcPtr, vec2D const srcPitch, vec3D const extent + ) + { + constexpr size_t X = 2; + constexpr size_t Y = 1; + constexpr size_t Z = 0; + + // blockDim.[y,z] is always 1 and is not needed for index calculations + // gridDim and blockIdx is already in alpaka index order [z,y,x] +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) + alpaka::vec::Vec, uint32_t> const tid( + blockIdx.x, + blockIdx.y, + blockIdx.z * blockDim.x + threadIdx.x); + + size_t const bytePerBlock = sizeof(T) * blockDim.x; + size_t const bytePerRow = gridDim.z * bytePerBlock; +#else + alpaka::vec::Vec, uint32_t> const tid( + hipBlockIdx_x, + hipBlockIdx_y, + hipBlockIdx_z * hipBlockDim_x + hipThreadIdx_x); + + size_t const bytePerBlock = sizeof(T) * hipBlockDim_x; + size_t const bytePerRow = hipGridDim_z * bytePerBlock; +#endif + + size_t const peelLoopSteps = extent[X] / bytePerRow; + bool const needRemainder = (extent[X] % bytePerRow) != 0; + + dstPtr += tid[Z] * dstPitch[Z] + tid[Y] * dstPitch[Y]; + srcPtr += tid[Z] * srcPitch[Z] + tid[Y] * srcPitch[Y]; + + for(size_t idx = 0; idx < peelLoopSteps; ++idx) + { + size_t const byteOffsetX = idx * bytePerRow + tid[X] * sizeof(T); + auto dst = reinterpret_cast(dstPtr + byteOffsetX); + auto src = reinterpret_cast(srcPtr + byteOffsetX); + *dst = *src; + } + if(needRemainder) + { + size_t const byteOffsetX = peelLoopSteps * bytePerRow + tid[X] * sizeof(T); + if(byteOffsetX < extent[X]) + { + auto dst = reinterpret_cast(dstPtr + byteOffsetX); + auto 
src = reinterpret_cast(srcPtr + byteOffsetX); + *dst = *src; + } + } + } + + inline size_t divUp(size_t const & x, size_t const & y) + { + return (x + y - 1u) / y; + } + + inline auto memcpy3DEmulatedD2DAsync(ALPAKA_API_PREFIX(Memcpy3DParms) const * const p, ALPAKA_API_PREFIX(Stream_t) stream) + { + using dim3Value_t = std::remove_reference_t().x)>; + // extent[2] is in byte + vec3D const extent(p->extent.depth, p->extent.height, p->extent.width); + // pitch in bytes + vec2D const dstPitch(p->dstPtr.pitch * p->dstPtr.ysize,p->dstPtr.pitch); + vec2D const srcPitch(p->srcPtr.pitch * p->srcPtr.ysize,p->srcPtr.pitch); + // offset[2] is in byte + vec3D const dstOffset(p->dstPos.z,p->dstPos.y,p->dstPos.x); + vec3D const srcOffset(p->srcPos.z,p->srcPos.y,p->srcPos.x); + + char const * srcPtr = + reinterpret_cast(p->srcPtr.ptr) + srcOffset[0] * srcPitch[0] + srcOffset[1] * srcPitch[1] + srcOffset[2]; + char * dstPtr = + reinterpret_cast(p->dstPtr.ptr) + dstOffset[0] * dstPitch[0] + dstOffset[1] * dstPitch[1] + dstOffset[2]; + + bool const use4Byte = (reinterpret_cast(srcPtr) % 4u) == 0u && (reinterpret_cast(dstPtr) % 4u) == 0u && (extent[2] % 4u) == 0u; + + if(use4Byte) + { + dim3 block(static_cast(std::min(divUp(extent[2], 4u), size_t(256u)))); + // use alpaka index order [z,y,x] because x is by default 1 + dim3 grid(static_cast(extent[0]), static_cast(extent[1]), 1u); + // for less than 100 blocks increase the number of blocks used to copy a row to + // increase the utilization of the device + if(grid.x * grid.y < 100) + { + grid.z = static_cast(divUp(divUp(extent[2], 4u), block.x)); + } + +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) + hipMemcpy3DEmulatedKernelD2D<<< + grid, block, 0, stream>>>( + dstPtr, dstPitch, srcPtr, srcPitch, extent); +#else + hipLaunchKernelGGL( + HIP_KERNEL_NAME(hipMemcpy3DEmulatedKernelD2D), + grid, block, 0, stream, + dstPtr, dstPitch, srcPtr, srcPitch, extent); +#endif + } + else + { + dim3 block(static_cast(std::min(extent[2], size_t(256u)))); 
+ // use alpaka index order [z,y,x] because x is by default 1 + dim3 grid(static_cast(extent[0]), static_cast(extent[1]), 1u); + // for less than 100 blocks increase the number of blocks used to copy a row to + // increase the utilization of the device + if(grid.x * grid.y < 100) + { + grid.z = static_cast(divUp(extent[2], block.x)); + } +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) + hipMemcpy3DEmulatedKernelD2D<<< + grid, block, 0, stream>>>( + dstPtr, dstPitch, srcPtr, srcPitch, extent); +#else + hipLaunchKernelGGL( + HIP_KERNEL_NAME(hipMemcpy3DEmulatedKernelD2D), + grid, block, 0, stream, + dstPtr, dstPitch, srcPtr, srcPitch, extent); +#endif + } + return ALPAKA_API_PREFIX(GetLastError)(); + }; + + //----------------------------------------------------------------------------- + //! Not being able to enable peer access does not prevent such device to device memory copies. + //! However, those copies may be slower because the memory is copied via the CPU. + inline auto enablePeerAccessIfPossible( + const int & devSrc, + const int & devDst) + -> void + { + ALPAKA_ASSERT(devSrc != devDst); + +#if BOOST_COMP_CLANG + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wexit-time-destructors" +#endif + static std::set> alreadyCheckedPeerAccessDevices; +#if BOOST_COMP_CLANG + #pragma clang diagnostic pop +#endif + auto const devicePair = std::make_pair(devSrc, devDst); + + if(alreadyCheckedPeerAccessDevices.find(devicePair) == alreadyCheckedPeerAccessDevices.end()) + { + alreadyCheckedPeerAccessDevices.insert(devicePair); + + int canAccessPeer = 0; + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(DeviceCanAccessPeer)(&canAccessPeer, devSrc, devDst)); + + if(!canAccessPeer) { +#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL + std::cout << __func__ + << " Direct peer access between given GPUs is not possible!" 
+ << " src=" << devSrc + << " dst=" << devDst + << std::endl; +#endif + return; + } + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(devSrc)); + + // NOTE: "until access is explicitly disabled using cudaDeviceDisablePeerAccess() or either device is reset using cudaDeviceReset()." + // We do not remove a device from the enabled device pairs on cudaDeviceReset. + // Note that access granted by this call is unidirectional and that in order to access memory on the current device from peerDevice, a separate symmetric call to cudaDeviceEnablePeerAccess() is required. + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(DeviceEnablePeerAccess)(devDst, 0)); + } + } + //############################################################################# //! The CUDA/HIP memory copy trait. template< @@ -119,6 +301,55 @@ namespace alpaka #endif } + //----------------------------------------------------------------------------- + template< + typename TQueue + > + auto enqueue(TQueue & queue) const + -> void + { +#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL + printDebug(); +#endif + if(m_extentWidthBytes == 0) + { + return; + } + + auto const & uniformCudaHipMemCpyKind(m_uniformMemCpyKind); + + if(m_iDstDevice == m_iSrcDevice) + { + // Set the current device. + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( + m_iDstDevice)); + // Initiate the memory copy. + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(MemcpyAsync)( + m_dstMemNative, + m_srcMemNative, + static_cast(m_extentWidthBytes), + uniformCudaHipMemCpyKind, + queue.m_spQueueImpl->m_UniformCudaHipQueue)); + } + else + { + alpaka::mem::view::uniform_cuda_hip::detail::enablePeerAccessIfPossible(m_iSrcDevice, m_iDstDevice); + + // Initiate the memory copy. 
+ ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(MemcpyPeerAsync)( + m_dstMemNative, + m_iDstDevice, + m_srcMemNative, + m_iSrcDevice, + static_cast(m_extentWidthBytes), + queue.m_spQueueImpl->m_UniformCudaHipQueue)); + } + } + + private: #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL //----------------------------------------------------------------------------- ALPAKA_FN_HOST auto printDebug() const @@ -220,6 +451,94 @@ namespace alpaka #endif } + //----------------------------------------------------------------------------- + template< + typename TQueue + > + auto enqueue(TQueue & queue) const + -> void + { +#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL + printDebug(); +#endif + // This is not only an optimization but also prevents a division by zero. + if(m_extentWidthBytes == 0 || m_extentHeight == 0) + { + return; + } + + if(m_iDstDevice == m_iSrcDevice) + { + // Set the current device. + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( + m_iDstDevice)); + // Initiate the memory copy. + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(Memcpy2DAsync)( + m_dstMemNative, + static_cast(m_dstpitchBytesX), + m_srcMemNative, + static_cast(m_srcpitchBytesX), + static_cast(m_extentWidthBytes), + static_cast(m_extentHeight), + m_uniformMemCpyKind, + queue.m_spQueueImpl->m_UniformCudaHipQueue)); + } + else + { + alpaka::mem::view::uniform_cuda_hip::detail::enablePeerAccessIfPossible(m_iSrcDevice, m_iDstDevice); +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) + // There is no cudaMemcpy2DPeerAsync, therefore we use cudaMemcpy3DPeerAsync. + // Create the struct describing the copy. + ALPAKA_API_PREFIX(Memcpy3DPeerParms) const memCpy3DPeerParms( + buildCudaMemcpy3DPeerParms()); + // Initiate the memory copy. 
+ ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + cudaMemcpy3DPeerAsync( + &memCpy3DPeerParms, + queue.m_spQueueImpl->m_UniformCudaHipQueue)); +#endif + } + } + + private: +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST auto buildCudaMemcpy3DPeerParms() const + -> cudaMemcpy3DPeerParms + { + ALPAKA_DEBUG_FULL_LOG_SCOPE; + + // Fill CUDA parameter structure. + cudaMemcpy3DPeerParms cudaMemCpy3DPeerParms; + cudaMemCpy3DPeerParms.dstArray = nullptr; // Either dstArray or dstPtr. + cudaMemCpy3DPeerParms.dstDevice = m_iDstDevice; + cudaMemCpy3DPeerParms.dstPos = make_cudaPos(0, 0, 0); // Optional. Offset in bytes. + cudaMemCpy3DPeerParms.dstPtr = + make_cudaPitchedPtr( + m_dstMemNative, + static_cast(m_dstpitchBytesX), + static_cast(m_dstWidth), + static_cast(m_dstPitchBytesY / m_dstpitchBytesX)); + cudaMemCpy3DPeerParms.extent = + make_cudaExtent( + static_cast(m_extentWidthBytes), + static_cast(m_extentHeight), + static_cast(1u)); + cudaMemCpy3DPeerParms.srcArray = nullptr; // Either srcArray or srcPtr. + cudaMemCpy3DPeerParms.srcDevice = m_iSrcDevice; + cudaMemCpy3DPeerParms.srcPos = make_cudaPos(0, 0, 0); // Optional. Offset in bytes. + cudaMemCpy3DPeerParms.srcPtr = + make_cudaPitchedPtr( + const_cast(m_srcMemNative), + static_cast(m_srcpitchBytesX), + static_cast(m_srcWidth), + static_cast(m_srcPitchBytesY / m_srcpitchBytesX)); + + return cudaMemCpy3DPeerParms; + } +#endif #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL //----------------------------------------------------------------------------- ALPAKA_FN_HOST auto printDebug() const @@ -345,6 +664,142 @@ namespace alpaka #endif } + //----------------------------------------------------------------------------- + template< + typename TQueue + > + auto enqueue(TQueue & queue) const + -> void + { +#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL + printDebug(); +#endif + // This is not only an optimization but also prevents a division by zero. 
+ if(m_extentWidthBytes == 0 || m_extentHeight == 0 || m_extentDepth == 0) + { + return; + } + + if(m_iDstDevice == m_iSrcDevice) + { + // Create the struct describing the copy. + ALPAKA_API_PREFIX(Memcpy3DParms) const uniformCudaHipMemCpy3DParms( + buildUniformCudaHipMemcpy3DParms()); + // Set the current device. + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( + m_iDstDevice)); +#if defined(ALPAKA_EMU_MEMCPY3D_ENABLED) + auto isDevice2DeviceCopy = m_uniformMemCpyKind == ALPAKA_API_PREFIX(MemcpyDeviceToDevice); + // contiguous memory can be copied by a single 1D memory copy + auto isContiguousMemory = uniformCudaHipMemCpy3DParms.extent.width == uniformCudaHipMemCpy3DParms.dstPtr.pitch && + uniformCudaHipMemCpy3DParms.extent.width == uniformCudaHipMemCpy3DParms.srcPtr.pitch; + if(isDevice2DeviceCopy && !isContiguousMemory) + { + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + uniform_cuda_hip::detail::memcpy3DEmulatedD2DAsync( + &uniformCudaHipMemCpy3DParms, + queue.m_spQueueImpl->m_UniformCudaHipQueue)); + } + else +#endif + { + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(Memcpy3DAsync)( + &uniformCudaHipMemCpy3DParms, + queue.m_spQueueImpl->m_UniformCudaHipQueue)); + } + } + else + { + alpaka::mem::view::uniform_cuda_hip::detail::enablePeerAccessIfPossible(m_iSrcDevice, m_iDstDevice); +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) + // Create the struct describing the copy. + cudaMemcpy3DPeerParms const cudaMemCpy3DPeerParms( + buildCudaMemcpy3DPeerParms()); + // Initiate the memory copy. + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + cudaMemcpy3DPeerAsync( + &cudaMemCpy3DPeerParms, + queue.m_spQueueImpl->m_UniformCudaHipQueue)); +#endif + } + } + + private: + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST auto buildUniformCudaHipMemcpy3DParms() const + -> ALPAKA_API_PREFIX(Memcpy3DParms) + { + ALPAKA_DEBUG_FULL_LOG_SCOPE; + + // Fill CUDA/HIP parameter structure. 
+ ALPAKA_API_PREFIX(Memcpy3DParms) memCpy3DParms; + memCpy3DParms.srcArray = nullptr; // Either srcArray or srcPtr. + memCpy3DParms.srcPos = ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(Pos))(0, 0, 0); // Optional. Offset in bytes. + memCpy3DParms.srcPtr = + ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(PitchedPtr))( + const_cast(m_srcMemNative), + static_cast(m_srcpitchBytesX), + static_cast(m_srcWidth), + static_cast(m_srcPitchBytesY/m_srcpitchBytesX)); + memCpy3DParms.dstArray = nullptr; // Either dstArray or dstPtr. + memCpy3DParms.dstPos = ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(Pos))(0, 0, 0); // Optional. Offset in bytes. + memCpy3DParms.dstPtr = + ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(PitchedPtr))( + m_dstMemNative, + static_cast(m_dstpitchBytesX), + static_cast(m_dstWidth), + static_cast(m_dstPitchBytesY / m_dstpitchBytesX)); + memCpy3DParms.extent = + ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(Extent))( + static_cast(m_extentWidthBytes), + static_cast(m_extentHeight), + static_cast(m_extentDepth)); +#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_PLATFORM_NVCC__) + memCpy3DParms.kind = hipMemcpyKindToCudaMemcpyKind(m_uniformMemCpyKind); +#else + memCpy3DParms.kind = m_uniformMemCpyKind; +#endif + return memCpy3DParms; + } + +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST auto buildCudaMemcpy3DPeerParms() const + -> cudaMemcpy3DPeerParms + { + ALPAKA_DEBUG_FULL_LOG_SCOPE; + + // Fill CUDA parameter structure. + cudaMemcpy3DPeerParms cudaMemCpy3DPeerParms; + cudaMemCpy3DPeerParms.dstArray = nullptr; // Either dstArray or dstPtr. + cudaMemCpy3DPeerParms.dstDevice = m_iDstDevice; + cudaMemCpy3DPeerParms.dstPos = make_cudaPos(0, 0, 0); // Optional. Offset in bytes. 
+ cudaMemCpy3DPeerParms.dstPtr = + make_cudaPitchedPtr( + m_dstMemNative, + static_cast(m_dstpitchBytesX), + static_cast(m_dstWidth), + static_cast(m_dstPitchBytesY/m_dstpitchBytesX)); + cudaMemCpy3DPeerParms.extent = + make_cudaExtent( + static_cast(m_extentWidthBytes), + static_cast(m_extentHeight), + static_cast(m_extentDepth)); + cudaMemCpy3DPeerParms.srcArray = nullptr; // Either srcArray or srcPtr. + cudaMemCpy3DPeerParms.srcDevice = m_iSrcDevice; + cudaMemCpy3DPeerParms.srcPos = make_cudaPos(0, 0, 0); // Optional. Offset in bytes. + cudaMemCpy3DPeerParms.srcPtr = + make_cudaPitchedPtr( + const_cast(m_srcMemNative), + static_cast(m_srcpitchBytesX), + static_cast(m_srcWidth), + static_cast(m_srcPitchBytesY / m_srcpitchBytesX)); + + return cudaMemCpy3DPeerParms; + } +#endif #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL //----------------------------------------------------------------------------- ALPAKA_FN_HOST auto printDebug() const @@ -396,52 +851,6 @@ namespace alpaka void * m_dstMemNative; void const * m_srcMemNative; }; - - //----------------------------------------------------------------------------- - //! Not being able to enable peer access does not prevent such device to device memory copies. - //! However, those copies may be slower because the memory is copied via the CPU. 
- inline auto enablePeerAccessIfPossible( - const int & devSrc, - const int & devDst) - -> void - { - ALPAKA_ASSERT(devSrc != devDst); - -#if BOOST_COMP_CLANG - #pragma clang diagnostic push - #pragma clang diagnostic ignored "-Wexit-time-destructors" -#endif - static std::set> alreadyCheckedPeerAccessDevices; -#if BOOST_COMP_CLANG - #pragma clang diagnostic pop -#endif - auto const devicePair = std::make_pair(devSrc, devDst); - - if(alreadyCheckedPeerAccessDevices.find(devicePair) == alreadyCheckedPeerAccessDevices.end()) - { - alreadyCheckedPeerAccessDevices.insert(devicePair); - - int canAccessPeer = 0; - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(DeviceCanAccessPeer)(&canAccessPeer, devSrc, devDst)); - - if(!canAccessPeer) { -#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL - std::cout << __func__ - << " Direct peer access between given GPUs is not possible!" - << " src=" << devSrc - << " dst=" << devDst - << std::endl; -#endif - return; - } - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(devSrc)); - - // NOTE: "until access is explicitly disabled using cudaDeviceDisablePeerAccess() or either device is reset using cudaDeviceReset()." - // We do not remove a device from the enabled device pairs on cudaDeviceReset. - // Note that access granted by this call is unidirectional and that in order to access memory on the current device from peerDevice, a separate symmetric call to cudaDeviceEnablePeerAccess() is required. 
- ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(DeviceEnablePeerAccess)(devDst, 0)); - } - } } } @@ -576,193 +985,6 @@ namespace alpaka } }; } - namespace uniform_cuda_hip - { - namespace detail - { - //----------------------------------------------------------------------------- - template< - typename TExtent, - typename TViewSrc, - typename TViewDst> - ALPAKA_FN_HOST auto buildUniformCudaHipMemcpy3DParms( - mem::view::uniform_cuda_hip::detail::TaskCopyUniformCudaHip, TViewDst, TViewSrc, TExtent> const & task) - -> ALPAKA_API_PREFIX(Memcpy3DParms) - { - ALPAKA_DEBUG_FULL_LOG_SCOPE; - - auto const & extentWidthBytes(task.m_extentWidthBytes); - auto const & dstWidth(task.m_dstWidth); - auto const & srcWidth(task.m_srcWidth); - - auto const & extentHeight(task.m_extentHeight); - //auto const & dstHeight(task.m_dstHeight); - //auto const & srcHeight(task.m_srcHeight); - - auto const & extentDepth(task.m_extentDepth); - - auto const & dstPitchBytesX(task.m_dstpitchBytesX); - auto const & srcPitchBytesX(task.m_srcpitchBytesX); - auto const & dstPitchBytesY(task.m_dstPitchBytesY); - auto const & srcPitchBytesY(task.m_srcPitchBytesY); - - auto const & dstNativePtr(task.m_dstMemNative); - auto const & srcNativePtr(task.m_srcMemNative); - - // Fill CUDA/HIP parameter structure. - ALPAKA_API_PREFIX(Memcpy3DParms) memCpy3DParms; - memCpy3DParms.srcArray = nullptr; // Either srcArray or srcPtr. - memCpy3DParms.srcPos = ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(Pos))(0, 0, 0); // Optional. Offset in bytes. - memCpy3DParms.srcPtr = - ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(PitchedPtr))( - const_cast(srcNativePtr), - static_cast(srcPitchBytesX), - static_cast(srcWidth), - static_cast(srcPitchBytesY/srcPitchBytesX)); - memCpy3DParms.dstArray = nullptr; // Either dstArray or dstPtr. - memCpy3DParms.dstPos = ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(Pos))(0, 0, 0); // Optional. Offset in bytes. 
- memCpy3DParms.dstPtr = - ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(PitchedPtr))( - dstNativePtr, - static_cast(dstPitchBytesX), - static_cast(dstWidth), - static_cast(dstPitchBytesY / dstPitchBytesX)); - memCpy3DParms.extent = - ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(Extent))( - static_cast(extentWidthBytes), - static_cast(extentHeight), - static_cast(extentDepth)); -#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_PLATFORM_NVCC__) - memCpy3DParms.kind = hipMemcpyKindToCudaMemcpyKind(task.m_uniformMemCpyKind); -#else - memCpy3DParms.kind = task.m_uniformMemCpyKind; -#endif - return memCpy3DParms; - } -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - //----------------------------------------------------------------------------- - template< - typename TViewDst, - typename TViewSrc, - typename TExtent> - ALPAKA_FN_HOST auto buildCudaMemcpy3DPeerParms( - mem::view::uniform_cuda_hip::detail::TaskCopyUniformCudaHip, TViewDst, TViewSrc, TExtent> const & task) - -> cudaMemcpy3DPeerParms - { - ALPAKA_DEBUG_FULL_LOG_SCOPE; - - auto const & iDstDev(task.m_iDstDevice); - auto const & iSrcDev(task.m_iSrcDevice); - - auto const & extentWidthBytes(task.m_extentWidthBytes); - auto const & dstWidth(task.m_dstWidth); - auto const & srcWidth(task.m_srcWidth); - - auto const & extentHeight(task.m_extentHeight); - //auto const & dstHeight(task.m_dstHeight); - //auto const & srcHeight(task.m_srcHeight); - - auto const extentDepth(1u); - - auto const & dstPitchBytesX(task.m_dstpitchBytesX); - auto const & srcPitchBytesX(task.m_srcpitchBytesX); - auto const & dstPitchBytesY(task.m_dstPitchBytesY); - auto const & srcPitchBytesY(task.m_srcPitchBytesY); - - auto const & dstNativePtr(task.m_dstMemNative); - auto const & srcNativePtr(task.m_srcMemNative); - - // Fill CUDA parameter structure. - cudaMemcpy3DPeerParms cudaMemCpy3DPeerParms; - cudaMemCpy3DPeerParms.dstArray = nullptr; // Either dstArray or dstPtr. 
- cudaMemCpy3DPeerParms.dstDevice = iDstDev; - cudaMemCpy3DPeerParms.dstPos = make_cudaPos(0, 0, 0); // Optional. Offset in bytes. - cudaMemCpy3DPeerParms.dstPtr = - make_cudaPitchedPtr( - dstNativePtr, - static_cast(dstPitchBytesX), - static_cast(dstWidth), - static_cast(dstPitchBytesY / dstPitchBytesX)); - cudaMemCpy3DPeerParms.extent = - make_cudaExtent( - static_cast(extentWidthBytes), - static_cast(extentHeight), - static_cast(extentDepth)); - cudaMemCpy3DPeerParms.srcArray = nullptr; // Either srcArray or srcPtr. - cudaMemCpy3DPeerParms.srcDevice = iSrcDev; - cudaMemCpy3DPeerParms.srcPos = make_cudaPos(0, 0, 0); // Optional. Offset in bytes. - cudaMemCpy3DPeerParms.srcPtr = - make_cudaPitchedPtr( - const_cast(srcNativePtr), - static_cast(srcPitchBytesX), - static_cast(srcWidth), - static_cast(srcPitchBytesY / srcPitchBytesX)); - - return cudaMemCpy3DPeerParms; - } - //----------------------------------------------------------------------------- - template< - typename TViewDst, - typename TViewSrc, - typename TExtent> - ALPAKA_FN_HOST auto buildCudaMemcpy3DPeerParms( - mem::view::uniform_cuda_hip::detail::TaskCopyUniformCudaHip, TViewDst, TViewSrc, TExtent> const & task) - -> cudaMemcpy3DPeerParms - { - ALPAKA_DEBUG_FULL_LOG_SCOPE; - - auto const & iDstDev(task.m_iDstDevice); - auto const & iSrcDev(task.m_iSrcDevice); - - auto const & extentWidthBytes(task.m_extentWidthBytes); - auto const & dstWidth(task.m_dstWidth); - auto const & srcWidth(task.m_srcWidth); - - auto const & extentHeight(task.m_extentHeight); - //auto const & dstHeight(task.m_dstHeight); - //auto const & srcHeight(task.m_srcHeight); - - auto const & extentDepth(task.m_extentDepth); - - auto const & dstPitchBytesX(task.m_dstpitchBytesX); - auto const & srcPitchBytesX(task.m_srcpitchBytesX); - auto const & dstPitchBytesY(task.m_dstPitchBytesY); - auto const & srcPitchBytesY(task.m_srcPitchBytesY); - - auto const & dstNativePtr(task.m_dstMemNative); - auto const & 
srcNativePtr(task.m_srcMemNative); - - // Fill CUDA parameter structure. - cudaMemcpy3DPeerParms cudaMemCpy3DPeerParms; - cudaMemCpy3DPeerParms.dstArray = nullptr; // Either dstArray or dstPtr. - cudaMemCpy3DPeerParms.dstDevice = iDstDev; - cudaMemCpy3DPeerParms.dstPos = make_cudaPos(0, 0, 0); // Optional. Offset in bytes. - cudaMemCpy3DPeerParms.dstPtr = - make_cudaPitchedPtr( - dstNativePtr, - static_cast(dstPitchBytesX), - static_cast(dstWidth), - static_cast(dstPitchBytesY/dstPitchBytesX)); - cudaMemCpy3DPeerParms.extent = - make_cudaExtent( - static_cast(extentWidthBytes), - static_cast(extentHeight), - static_cast(extentDepth)); - cudaMemCpy3DPeerParms.srcArray = nullptr; // Either srcArray or srcPtr. - cudaMemCpy3DPeerParms.srcDevice = iSrcDev; - cudaMemCpy3DPeerParms.srcPos = make_cudaPos(0, 0, 0); // Optional. Offset in bytes. - cudaMemCpy3DPeerParms.srcPtr = - make_cudaPitchedPtr( - const_cast(srcNativePtr), - static_cast(srcPitchBytesX), - static_cast(srcWidth), - static_cast(srcPitchBytesY / srcPitchBytesX)); - - return cudaMemCpy3DPeerParms; - } -#endif - } - } } } namespace queue @@ -787,53 +1009,7 @@ namespace alpaka { ALPAKA_DEBUG_FULL_LOG_SCOPE; -#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL - task.printDebug(); -#endif - if(task.m_extentWidthBytes == 0) - { - return; - } - - auto const & iDstDev(task.m_iDstDevice); - auto const & iSrcDev(task.m_iSrcDevice); - - auto const & extentWidthBytes(task.m_extentWidthBytes); - - auto const & dstNativePtr(task.m_dstMemNative); - auto const & srcNativePtr(task.m_srcMemNative); - - auto const & uniformCudaHipMemCpyKind(task.m_uniformMemCpyKind); - - if(iDstDev == iSrcDev) - { - // Set the current device. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(SetDevice)( - iDstDev)); - // Initiate the memory copy. 
- ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(MemcpyAsync)( - dstNativePtr, - srcNativePtr, - static_cast(extentWidthBytes), - uniformCudaHipMemCpyKind, - queue.m_spQueueImpl->m_UniformCudaHipQueue)); - } - else - { - alpaka::mem::view::uniform_cuda_hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev); - - // Initiate the memory copy. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(MemcpyPeerAsync)( - dstNativePtr, - iDstDev, - srcNativePtr, - iSrcDev, - static_cast(extentWidthBytes), - queue.m_spQueueImpl->m_UniformCudaHipQueue)); - } + task.enqueue(queue); } }; //############################################################################# @@ -854,53 +1030,8 @@ namespace alpaka { ALPAKA_DEBUG_FULL_LOG_SCOPE; -#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL - task.printDebug(); -#endif - if(task.m_extentWidthBytes == 0) - { - return; - } - - auto const & iDstDev(task.m_iDstDevice); - auto const & iSrcDev(task.m_iSrcDevice); - - auto const & extentWidthBytes(task.m_extentWidthBytes); + task.enqueue(queue); - auto const & dstNativePtr(task.m_dstMemNative); - auto const & srcNativePtr(task.m_srcMemNative); - - if(iDstDev == iSrcDev) - { - auto const & uniformCudaHipMemCpyKind(task.m_uniformMemCpyKind); - - // Set the current device. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(SetDevice)( - iDstDev)); - // Initiate the memory copy. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(MemcpyAsync)( - dstNativePtr, - srcNativePtr, - static_cast(extentWidthBytes), - uniformCudaHipMemCpyKind, - queue.m_spQueueImpl->m_UniformCudaHipQueue)); - } - else - { - alpaka::mem::view::uniform_cuda_hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev); - - // Initiate the memory copy. 
- ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(MemcpyPeerAsync)( - dstNativePtr, - iDstDev, - srcNativePtr, - iSrcDev, - static_cast(extentWidthBytes), - queue.m_spQueueImpl->m_UniformCudaHipQueue)); - } ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( ALPAKA_API_PREFIX(StreamSynchronize)( queue.m_spQueueImpl->m_UniformCudaHipQueue)); @@ -924,63 +1055,7 @@ namespace alpaka { ALPAKA_DEBUG_FULL_LOG_SCOPE; -#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL - task.printDebug(); -#endif - // This is not only an optimization but also prevents a division by zero. - if(task.m_extentWidthBytes == 0 || task.m_extentHeight == 0) - { - return; - } - - auto const & iDstDev(task.m_iDstDevice); - auto const & iSrcDev(task.m_iSrcDevice); - - if(iDstDev == iSrcDev) - { - auto const & extentWidthBytes(task.m_extentWidthBytes); - auto const & extentHeight(task.m_extentHeight); - - auto const & dstPitchBytesX(task.m_dstpitchBytesX); - auto const & srcPitchBytesX(task.m_srcpitchBytesX); - - auto const & dstNativePtr(task.m_dstMemNative); - auto const & srcNativePtr(task.m_srcMemNative); - - auto const & memcpyKind(task.m_uniformMemCpyKind); - - // Set the current device. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(SetDevice)( - iDstDev)); - // Initiate the memory copy. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(Memcpy2DAsync)( - dstNativePtr, - static_cast(dstPitchBytesX), - srcNativePtr, - static_cast(srcPitchBytesX), - static_cast(extentWidthBytes), - static_cast(extentHeight), - memcpyKind, - queue.m_spQueueImpl->m_UniformCudaHipQueue)); - } - else - { - alpaka::mem::view::uniform_cuda_hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev); -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - // There is no cudaMemcpy2DPeerAsync, therefore we use cudaMemcpy3DPeerAsync. - // Create the struct describing the copy. - ALPAKA_API_PREFIX(Memcpy3DPeerParms) const memCpy3DPeerParms( - mem::view::uniform_cuda_hip::detail::buildCudaMemcpy3DPeerParms( - task)); - // Initiate the memory copy. 
- ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - cudaMemcpy3DPeerAsync( - &memCpy3DPeerParms, - queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#endif - } + task.enqueue(queue); } }; //############################################################################# @@ -1001,62 +1076,8 @@ namespace alpaka { ALPAKA_DEBUG_FULL_LOG_SCOPE; -#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL - task.printDebug(); -#endif - // This is not only an optimization but also prevents a division by zero. - if(task.m_extentWidthBytes == 0 || task.m_extentHeight == 0) - { - return; - } - - auto const & iDstDev(task.m_iDstDevice); - auto const & iSrcDev(task.m_iSrcDevice); + task.enqueue(queue); - if(iDstDev == iSrcDev) - { - auto const & extentWidthBytes(task.m_extentWidthBytes); - auto const & extentHeight(task.m_extentHeight); - - auto const & dstPitchBytesX(task.m_dstpitchBytesX); - auto const & srcPitchBytesX(task.m_srcpitchBytesX); - - auto const & dstNativePtr(task.m_dstMemNative); - auto const & srcNativePtr(task.m_srcMemNative); - auto const & memcpyKind(task.m_uniformMemCpyKind); - - // Set the current device. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(SetDevice)( - iDstDev)); - // Initiate the memory copy. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(Memcpy2DAsync)( - dstNativePtr, - static_cast(dstPitchBytesX), - srcNativePtr, - static_cast(srcPitchBytesX), - static_cast(extentWidthBytes), - static_cast(extentHeight), - memcpyKind, - queue.m_spQueueImpl->m_UniformCudaHipQueue)); - } - else - { - alpaka::mem::view::uniform_cuda_hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev); -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - // There is no cudaMemcpy2DPeerAsync, therefore we use cudaMemcpy3DPeerAsync. - // Create the struct describing the copy. - cudaMemcpy3DPeerParms const memCpy3DPeerParms( - mem::view::uniform_cuda_hip::detail::buildCudaMemcpy3DPeerParms( - task)); - // Initiate the memory copy. 
- ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - cudaMemcpy3DPeerAsync( - &memCpy3DPeerParms, - queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#endif - } ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( ALPAKA_API_PREFIX(StreamSynchronize)( queue.m_spQueueImpl->m_UniformCudaHipQueue)); @@ -1079,49 +1100,8 @@ namespace alpaka -> void { ALPAKA_DEBUG_FULL_LOG_SCOPE; -#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL - task.printDebug(); -#endif - // This is not only an optimization but also prevents a division by zero. - if(task.m_extentWidthBytes == 0 || task.m_extentHeight == 0 || task.m_extentDepth == 0) - { - return; - } - - auto const & iDstDev(task.m_iDstDevice); - auto const & iSrcDev(task.m_iSrcDevice); - if(iDstDev == iSrcDev) - { - // Create the struct describing the copy. - ALPAKA_API_PREFIX(Memcpy3DParms) const uniformCudaHipMemCpy3DParms( - mem::view::uniform_cuda_hip::detail::buildUniformCudaHipMemcpy3DParms( - task)); - // Set the current device. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(SetDevice)( - iDstDev)); - // Initiate the memory copy. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(Memcpy3DAsync)( - &uniformCudaHipMemCpy3DParms, - queue.m_spQueueImpl->m_UniformCudaHipQueue)); - } - else - { - alpaka::mem::view::uniform_cuda_hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev); -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - // Create the struct describing the copy. - cudaMemcpy3DPeerParms const cudaMemCpy3DPeerParms( - mem::view::uniform_cuda_hip::detail::buildCudaMemcpy3DPeerParms( - task)); - // Initiate the memory copy. 
- ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - cudaMemcpy3DPeerAsync( - &cudaMemCpy3DPeerParms, - queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#endif - } + task.enqueue(queue); } }; //############################################################################# @@ -1141,49 +1121,9 @@ namespace alpaka -> void { ALPAKA_DEBUG_FULL_LOG_SCOPE; -#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL - task.printDebug(); -#endif - // This is not only an optimization but also prevents a division by zero. - if(task.m_extentWidthBytes == 0 || task.m_extentHeight == 0 || task.m_extentDepth == 0) - { - return; - } - auto const & iDstDev(task.m_iDstDevice); - auto const & iSrcDev(task.m_iSrcDevice); + task.enqueue(queue); - if(iDstDev == iSrcDev) - { - // Create the struct describing the copy. - ALPAKA_API_PREFIX(Memcpy3DParms) const uniformCudaHipMemCpy3DParms( - mem::view::uniform_cuda_hip::detail::buildUniformCudaHipMemcpy3DParms( - task)); - // Set the current device. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(SetDevice)( - iDstDev)); - // Initiate the memory copy. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(Memcpy3DAsync)( - &uniformCudaHipMemCpy3DParms, - queue.m_spQueueImpl->m_UniformCudaHipQueue)); - } - else - { - alpaka::mem::view::uniform_cuda_hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev); -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - // Create the struct describing the copy. - cudaMemcpy3DPeerParms const cudaMemCpy3DPeerParms( - mem::view::uniform_cuda_hip::detail::buildCudaMemcpy3DPeerParms( - task)); - // Initiate the memory copy. 
- ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - cudaMemcpy3DPeerAsync( - &cudaMemCpy3DPeerParms, - queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#endif - } ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( ALPAKA_API_PREFIX(StreamSynchronize)( queue.m_spQueueImpl->m_UniformCudaHipQueue)); diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/uniformCudaHip/Set.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/uniformCudaHip/Set.hpp index 72c5506411..801ef37410 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/uniformCudaHip/Set.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/uniformCudaHip/Set.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -56,15 +56,15 @@ namespace alpaka namespace detail { //############################################################################# - //! The CUDA memory set trait. + //! The CUDA/HIP memory set task base. template< typename TDim, typename TView, typename TExtent> - struct TaskSetUniformCudaHip + struct TaskSetUniformCudaHipBase { //----------------------------------------------------------------------------- - TaskSetUniformCudaHip( + TaskSetUniformCudaHipBase( TView & view, std::uint8_t const & byte, TExtent const & extent) : @@ -82,11 +82,249 @@ namespace alpaka "The destination view and the extent are required to have the same dimensionality!"); } + protected: TView & m_view; std::uint8_t const m_byte; TExtent const m_extent; std::int32_t const m_iDevice; }; + + //############################################################################# + //! The CUDA/HIP memory set task. + template< + typename TDim, + typename TView, + typename TExtent> + struct TaskSetUniformCudaHip; + + //############################################################################# + //! 
The 1D CUDA/HIP memory set task. + template< + typename TView, + typename TExtent> + struct TaskSetUniformCudaHip< + dim::DimInt<1>, + TView, + TExtent> + : public TaskSetUniformCudaHipBase, TView, TExtent> + { + //----------------------------------------------------------------------------- + TaskSetUniformCudaHip( + TView & view, + std::uint8_t const & byte, + TExtent const & extent) : + TaskSetUniformCudaHipBase, TView, TExtent>(view, byte, extent) + { + } + + //----------------------------------------------------------------------------- + template< + typename TQueue + > + auto enqueue(TQueue & queue) const + -> void + { + static_assert( + dim::Dim::value == 1u, + "The destination buffer is required to be 1-dimensional for this specialization!"); + static_assert( + dim::Dim::value == dim::Dim::value, + "The destination buffer and the extent are required to have the same dimensionality!"); + + using Idx = idx::Idx; + + auto & view(this->m_view); + auto const & extent(this->m_extent); + + auto const extentWidth(extent::getWidth(extent)); + + if(extentWidth == 0) + { + return; + } + + auto const extentWidthBytes(extentWidth * static_cast(sizeof(elem::Elem))); +#if !defined(NDEBUG) + auto const dstWidth(extent::getWidth(view)); +#endif + auto const dstNativePtr(reinterpret_cast(mem::view::getPtrNative(view))); + ALPAKA_ASSERT(extentWidth <= dstWidth); + + // Set the current device. + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( + this->m_iDevice)); + // Initiate the memory set. + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(MemsetAsync)( + dstNativePtr, + static_cast(this->m_byte), + static_cast(extentWidthBytes), + queue.m_spQueueImpl->m_UniformCudaHipQueue)); + } + }; + //############################################################################# + //! The 2D CUDA/HIP memory set task. 
+ template< + typename TView, + typename TExtent> + struct TaskSetUniformCudaHip< + dim::DimInt<2>, + TView, + TExtent> + : public TaskSetUniformCudaHipBase, TView, TExtent> + { + //----------------------------------------------------------------------------- + TaskSetUniformCudaHip( + TView & view, + std::uint8_t const & byte, + TExtent const & extent) : + TaskSetUniformCudaHipBase, TView, TExtent>(view, byte, extent) + { + } + + //----------------------------------------------------------------------------- + template< + typename TQueue + > + auto enqueue(TQueue & queue) const + -> void + { + static_assert( + dim::Dim::value == 2u, + "The destination buffer is required to be 2-dimensional for this specialization!"); + static_assert( + dim::Dim::value == dim::Dim::value, + "The destination buffer and the extent are required to have the same dimensionality!"); + + using Idx = idx::Idx; + + auto & view(this->m_view); + auto const & extent(this->m_extent); + + auto const extentWidth(extent::getWidth(extent)); + auto const extentHeight(extent::getHeight(extent)); + + if(extentWidth == 0 || extentHeight == 0) + { + return; + } + + auto const extentWidthBytes(extentWidth * static_cast(sizeof(elem::Elem))); + +#if !defined(NDEBUG) + auto const dstWidth(extent::getWidth(view)); + auto const dstHeight(extent::getHeight(view)); +#endif + auto const dstPitchBytesX(mem::view::getPitchBytes::value - 1u>(view)); + auto const dstNativePtr(reinterpret_cast(mem::view::getPtrNative(view))); + ALPAKA_ASSERT(extentWidth <= dstWidth); + ALPAKA_ASSERT(extentHeight <= dstHeight); + + // Set the current device. + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( + this->m_iDevice)); + // Initiate the memory set. 
+ ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(Memset2DAsync)( + dstNativePtr, + static_cast(dstPitchBytesX), + static_cast(this->m_byte), + static_cast(extentWidthBytes), + static_cast(extentHeight), + queue.m_spQueueImpl->m_UniformCudaHipQueue)); + } + }; + //############################################################################# + //! The 3D CUDA/HIP memory set task. + template< + typename TView, + typename TExtent> + struct TaskSetUniformCudaHip< + dim::DimInt<3>, + TView, + TExtent> + : public TaskSetUniformCudaHipBase, TView, TExtent> + { + //----------------------------------------------------------------------------- + TaskSetUniformCudaHip( + TView & view, + std::uint8_t const & byte, + TExtent const & extent) : + TaskSetUniformCudaHipBase, TView, TExtent>(view, byte, extent) + { + } + + //----------------------------------------------------------------------------- + template< + typename TQueue + > + auto enqueue(TQueue & queue) const + -> void + { + static_assert( + dim::Dim::value == 3u, + "The destination buffer is required to be 3-dimensional for this specialization!"); + static_assert( + dim::Dim::value == dim::Dim::value, + "The destination buffer and the extent are required to have the same dimensionality!"); + + using Elem = alpaka::elem::Elem; + using Idx = idx::Idx; + + auto & view(this->m_view); + auto const & extent(this->m_extent); + + auto const extentWidth(extent::getWidth(extent)); + auto const extentHeight(extent::getHeight(extent)); + auto const extentDepth(extent::getDepth(extent)); + + // This is not only an optimization but also prevents a division by zero. 
+ if(extentWidth == 0 || extentHeight == 0 || extentDepth == 0) + { + return; + } + + auto const dstWidth(extent::getWidth(view)); +#if !defined(NDEBUG) + auto const dstHeight(extent::getHeight(view)); + auto const dstDepth(extent::getDepth(view)); +#endif + auto const dstPitchBytesX(mem::view::getPitchBytes::value - 1u>(view)); + auto const dstPitchBytesY(mem::view::getPitchBytes::value - (2u % dim::Dim::value)>(view)); + auto const dstNativePtr(reinterpret_cast(mem::view::getPtrNative(view))); + ALPAKA_ASSERT(extentWidth <= dstWidth); + ALPAKA_ASSERT(extentHeight <= dstHeight); + ALPAKA_ASSERT(extentDepth <= dstDepth); + + // Fill CUDA parameter structures. + ALPAKA_API_PREFIX(PitchedPtr) const pitchedPtrVal( + ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(PitchedPtr))( + dstNativePtr, + static_cast(dstPitchBytesX), + static_cast(dstWidth * static_cast(sizeof(Elem))), + static_cast(dstPitchBytesY / dstPitchBytesX))); + + ALPAKA_API_PREFIX(Extent) const extentVal( + ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(Extent))( + static_cast(extentWidth * static_cast(sizeof(Elem))), + static_cast(extentHeight), + static_cast(extentDepth))); + + // Set the current device. + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( + this->m_iDevice)); + // Initiate the memory set. 
+ ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(Memset3DAsync)( + pitchedPtrVal, + static_cast(this->m_byte), + extentVal, + queue.m_spQueueImpl->m_UniformCudaHipQueue)); + } + }; } } namespace traits @@ -146,45 +384,7 @@ namespace alpaka { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - static_assert( - dim::Dim::value == 1u, - "The destination buffer is required to be 1-dimensional for this specialization!"); - static_assert( - dim::Dim::value == dim::Dim::value, - "The destination buffer and the extent are required to have the same dimensionality!"); - - using Idx = idx::Idx; - - auto & view(task.m_view); - auto const & byte(task.m_byte); - auto const & extent(task.m_extent); - auto const & iDevice(task.m_iDevice); - - auto const extentWidth(extent::getWidth(extent)); - - if(extentWidth == 0) - { - return; - } - - auto const extentWidthBytes(extentWidth * static_cast(sizeof(elem::Elem))); -#if !defined(NDEBUG) - auto const dstWidth(extent::getWidth(view)); -#endif - auto const dstNativePtr(reinterpret_cast(mem::view::getPtrNative(view))); - ALPAKA_ASSERT(extentWidth <= dstWidth); - - // Set the current device. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(SetDevice)( - iDevice)); - // Initiate the memory set. 
- ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(MemsetAsync)( - dstNativePtr, - static_cast(byte), - static_cast(extentWidthBytes), - queue.m_spQueueImpl->m_UniformCudaHipQueue)); + task.enqueue(queue); } }; //############################################################################# @@ -204,45 +404,8 @@ namespace alpaka { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - static_assert( - dim::Dim::value == 1u, - "The destination buffer is required to be 1-dimensional for this specialization!"); - static_assert( - dim::Dim::value == dim::Dim::value, - "The destination buffer and the extent are required to have the same dimensionality!"); + task.enqueue(queue); - using Idx = idx::Idx; - - auto & view(task.m_view); - auto const & byte(task.m_byte); - auto const & extent(task.m_extent); - auto const & iDevice(task.m_iDevice); - - auto const extentWidth(extent::getWidth(extent)); - - if(extentWidth == 0) - { - return; - } - - auto const extentWidthBytes(extentWidth * static_cast(sizeof(elem::Elem))); -#if !defined(NDEBUG) - auto const dstWidth(extent::getWidth(view)); -#endif - auto const dstNativePtr(reinterpret_cast(mem::view::getPtrNative(view))); - ALPAKA_ASSERT(extentWidth <= dstWidth); - - // Set the current device. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(SetDevice)( - iDevice)); - // Initiate the memory set. 
- ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(MemsetAsync)( - dstNativePtr, - static_cast(byte), - static_cast(extentWidthBytes), - queue.m_spQueueImpl->m_UniformCudaHipQueue)); wait::wait(queue); } }; @@ -263,52 +426,7 @@ namespace alpaka { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - static_assert( - dim::Dim::value == 2u, - "The destination buffer is required to be 2-dimensional for this specialization!"); - static_assert( - dim::Dim::value == dim::Dim::value, - "The destination buffer and the extent are required to have the same dimensionality!"); - - using Idx = idx::Idx; - - auto & view(task.m_view); - auto const & byte(task.m_byte); - auto const & extent(task.m_extent); - auto const & iDevice(task.m_iDevice); - - auto const extentWidth(extent::getWidth(extent)); - auto const extentHeight(extent::getHeight(extent)); - - if(extentWidth == 0 || extentHeight == 0) - { - return; - } - - auto const extentWidthBytes(extentWidth * static_cast(sizeof(elem::Elem))); - -#if !defined(NDEBUG) - auto const dstWidth(extent::getWidth(view)); - auto const dstHeight(extent::getHeight(view)); -#endif - auto const dstPitchBytesX(mem::view::getPitchBytes::value - 1u>(view)); - auto const dstNativePtr(reinterpret_cast(mem::view::getPtrNative(view))); - ALPAKA_ASSERT(extentWidth <= dstWidth); - ALPAKA_ASSERT(extentHeight <= dstHeight); - - // Set the current device. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(SetDevice)( - iDevice)); - // Initiate the memory set. 
- ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(Memset2DAsync)( - dstNativePtr, - static_cast(dstPitchBytesX), - static_cast(byte), - static_cast(extentWidthBytes), - static_cast(extentHeight), - queue.m_spQueueImpl->m_UniformCudaHipQueue)); + task.enqueue(queue); } }; //############################################################################# @@ -328,53 +446,7 @@ namespace alpaka { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - static_assert( - dim::Dim::value == 2u, - "The destination buffer is required to be 2-dimensional for this specialization!"); - static_assert( - dim::Dim::value == dim::Dim::value, - "The destination buffer and the extent are required to have the same dimensionality!"); - - using Idx = idx::Idx; - - auto & view(task.m_view); - auto const & byte(task.m_byte); - auto const & extent(task.m_extent); - auto const & iDevice(task.m_iDevice); - - auto const extentWidth(extent::getWidth(extent)); - auto const extentHeight(extent::getHeight(extent)); - - if(extentWidth == 0 || extentHeight == 0) - { - return; - } - - auto const extentWidthBytes(extentWidth * static_cast(sizeof(elem::Elem))); - -#if !defined(NDEBUG) - auto const dstWidth(extent::getWidth(view)); - auto const dstHeight(extent::getHeight(view)); -#endif - auto const dstPitchBytesX(mem::view::getPitchBytes::value - 1u>(view)); - auto const dstNativePtr(reinterpret_cast(mem::view::getPtrNative(view))); - ALPAKA_ASSERT(extentWidth <= dstWidth); - ALPAKA_ASSERT(extentHeight <= dstHeight); - - // Set the current device. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(SetDevice)( - iDevice)); - - // Initiate the memory set. 
- ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(Memset2DAsync)( - dstNativePtr, - static_cast(dstPitchBytesX), - static_cast(byte), - static_cast(extentWidthBytes), - static_cast(extentHeight), - queue.m_spQueueImpl->m_UniformCudaHipQueue)); + task.enqueue(queue); wait::wait(queue); } @@ -396,68 +468,7 @@ namespace alpaka { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - static_assert( - dim::Dim::value == 3u, - "The destination buffer is required to be 3-dimensional for this specialization!"); - static_assert( - dim::Dim::value == dim::Dim::value, - "The destination buffer and the extent are required to have the same dimensionality!"); - - using Elem = alpaka::elem::Elem; - using Idx = idx::Idx; - - auto & view(task.m_view); - auto const & byte(task.m_byte); - auto const & extent(task.m_extent); - auto const & iDevice(task.m_iDevice); - - auto const extentWidth(extent::getWidth(extent)); - auto const extentHeight(extent::getHeight(extent)); - auto const extentDepth(extent::getDepth(extent)); - - // This is not only an optimization but also prevents a division by zero. - if(extentWidth == 0 || extentHeight == 0 || extentDepth == 0) - { - return; - } - - auto const dstWidth(extent::getWidth(view)); -#if !defined(NDEBUG) - auto const dstHeight(extent::getHeight(view)); - auto const dstDepth(extent::getDepth(view)); -#endif - auto const dstPitchBytesX(mem::view::getPitchBytes::value - 1u>(view)); - auto const dstPitchBytesY(mem::view::getPitchBytes::value - (2u % dim::Dim::value)>(view)); - auto const dstNativePtr(reinterpret_cast(mem::view::getPtrNative(view))); - ALPAKA_ASSERT(extentWidth <= dstWidth); - ALPAKA_ASSERT(extentHeight <= dstHeight); - ALPAKA_ASSERT(extentDepth <= dstDepth); - - // Fill CUDA parameter structures. 
- ALPAKA_API_PREFIX(PitchedPtr) const pitchedPtrVal( - ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(PitchedPtr))( - dstNativePtr, - static_cast(dstPitchBytesX), - static_cast(dstWidth * static_cast(sizeof(Elem))), - static_cast(dstPitchBytesY / dstPitchBytesX))); - - ALPAKA_API_PREFIX(Extent) const extentVal( - ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(Extent))( - static_cast(extentWidth * static_cast(sizeof(Elem))), - static_cast(extentHeight), - static_cast(extentDepth))); - - // Set the current device. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(SetDevice)( - iDevice)); - // Initiate the memory set. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(Memset3DAsync)( - pitchedPtrVal, - static_cast(byte), - extentVal, - queue.m_spQueueImpl->m_UniformCudaHipQueue)); + task.enqueue(queue); } }; //############################################################################# @@ -477,68 +488,7 @@ namespace alpaka { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - static_assert( - dim::Dim::value == 3u, - "The destination buffer is required to be 3-dimensional for this specialization!"); - static_assert( - dim::Dim::value == dim::Dim::value, - "The destination buffer and the extent are required to have the same dimensionality!"); - - using Elem = alpaka::elem::Elem; - using Idx = idx::Idx; - - auto & view(task.m_view); - auto const & byte(task.m_byte); - auto const & extent(task.m_extent); - auto const & iDevice(task.m_iDevice); - - auto const extentWidth(extent::getWidth(extent)); - auto const extentHeight(extent::getHeight(extent)); - auto const extentDepth(extent::getDepth(extent)); - - // This is not only an optimization but also prevents a division by zero. 
- if(extentWidth == 0 || extentHeight == 0 || extentDepth == 0) - { - return; - } - - auto const dstWidth(extent::getWidth(view)); -#if !defined(NDEBUG) - auto const dstHeight(extent::getHeight(view)); - auto const dstDepth(extent::getDepth(view)); -#endif - auto const dstPitchBytesX(mem::view::getPitchBytes::value - 1u>(view)); - auto const dstPitchBytesY(mem::view::getPitchBytes::value - (2u % dim::Dim::value)>(view)); - auto const dstNativePtr(reinterpret_cast(mem::view::getPtrNative(view))); - ALPAKA_ASSERT(extentWidth <= dstWidth); - ALPAKA_ASSERT(extentHeight <= dstHeight); - ALPAKA_ASSERT(extentDepth <= dstDepth); - - // Fill CUDA parameter structures. - ALPAKA_API_PREFIX(PitchedPtr) const pitchedPtrVal( - ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(PitchedPtr))( - dstNativePtr, - static_cast(dstPitchBytesX), - static_cast(dstWidth * static_cast(sizeof(Elem))), - static_cast(dstPitchBytesY / dstPitchBytesX))); - - ALPAKA_API_PREFIX(Extent) const extentVal( - ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(Extent))( - static_cast(extentWidth * static_cast(sizeof(Elem))), - static_cast(extentHeight), - static_cast(extentDepth))); - - // Set the current device. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(SetDevice)( - iDevice)); - // Initiate the memory set. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(Memset3DAsync)( - pitchedPtrVal, - static_cast(byte), - extentVal, - queue.m_spQueueImpl->m_UniformCudaHipQueue)); + task.enqueue(queue); wait::wait(queue); } diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/view/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/view/Traits.hpp index 681a40ead6..5450bc334b 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/mem/view/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/mem/view/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. 
* * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -21,8 +21,6 @@ #include #include -#include - #include #include @@ -531,9 +529,8 @@ namespace alpaka -> vec::Vec, idx::Idx> { return - vec::createVecFromIndexedFnWorkaround< + vec::createVecFromIndexedFn< dim::Dim, - idx::Idx, detail::CreatePitchBytes>( pitch); } @@ -548,9 +545,8 @@ namespace alpaka { using IdxOffset = std::integral_constant(dim::Dim::value) - static_cast(TDim::value)>; return - vec::createVecFromIndexedFnOffsetWorkaround< + vec::createVecFromIndexedFnOffset< TDim, - idx::Idx, detail::CreatePitchBytes, IdxOffset>( pitch); diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewCompileTimeArray.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewCompileTimeArray.hpp index 8009f53cc5..dec56aac8a 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewCompileTimeArray.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewCompileTimeArray.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewPlainPtr.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewPlainPtr.hpp index 97bd4e3c8c..35a6c84b79 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewPlainPtr.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewPlainPtr.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -35,13 +35,15 @@ namespace alpaka static_assert( !std::is_const::value, "The idx type of the view can not be const!"); + + using Dev = alpaka::dev::Dev; public: //----------------------------------------------------------------------------- template< typename TExtent> ALPAKA_FN_HOST ViewPlainPtr( TElem * pMem, - TDev const & dev, + Dev const & dev, TExtent const & extent = TExtent()) : m_pMem(pMem), m_dev(dev), @@ -55,7 +57,7 @@ namespace alpaka typename TPitch> ALPAKA_FN_HOST ViewPlainPtr( TElem * pMem, - TDev const dev, + Dev const dev, TExtent const & extent, TPitch const & pitchBytes) : m_pMem(pMem), @@ -74,7 +76,7 @@ namespace alpaka ViewPlainPtr(ViewPlainPtr const &) = default; //----------------------------------------------------------------------------- ALPAKA_FN_HOST - ViewPlainPtr(ViewPlainPtr && other) : + ViewPlainPtr(ViewPlainPtr && other) noexcept : m_pMem(other.m_pMem), m_dev(other.m_dev), m_extentElements(other.m_extentElements), @@ -110,7 +112,7 @@ namespace alpaka public: TElem * const m_pMem; - TDev const m_dev; + Dev const m_dev; vec::Vec const m_extentElements; vec::Vec const m_pitchBytes; }; @@ -133,7 +135,7 @@ namespace alpaka struct DevType< mem::view::ViewPlainPtr> { - using type = TDev; + using type = alpaka::dev::Dev; }; //############################################################################# @@ -148,7 +150,7 @@ namespace alpaka { static auto getDev( mem::view::ViewPlainPtr const & view) - -> TDev + -> alpaka::dev::Dev { return view.m_dev; } diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewStdArray.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewStdArray.hpp index 3f4243758e..b90e71e3f7 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewStdArray.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewStdArray.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. 
+ * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewStdVector.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewStdVector.hpp index 860108261f..194f23ab69 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewStdVector.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewStdVector.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewSubView.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewSubView.hpp index 2c384420bb..7549181c21 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewSubView.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewSubView.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -43,6 +43,9 @@ namespace alpaka static_assert( !std::is_const::value, "The idx type of the view can not be const!"); + + using Dev = alpaka::dev::Dev; + public: //----------------------------------------------------------------------------- //! Constructor. 
@@ -68,8 +71,8 @@ namespace alpaka ALPAKA_DEBUG_FULL_LOG_SCOPE; static_assert( - std::is_same>::value, - "The dev type of TView and the TDev template parameter have to be identical!"); + std::is_same>::value, + "The dev type of TView and the Dev template parameter have to be identical!"); static_assert( std::is_same>::value, @@ -117,8 +120,8 @@ namespace alpaka ALPAKA_DEBUG_FULL_LOG_SCOPE; static_assert( - std::is_same>::value, - "The dev type of TView and the TDev template parameter have to be identical!"); + std::is_same>::value, + "The dev type of TView and the Dev template parameter have to be identical!"); static_assert( std::is_same>::value, @@ -147,7 +150,7 @@ namespace alpaka //! \param view The view this view is a sub-view of. template< typename TView> - ViewSubView( + explicit ViewSubView( TView const & view) : ViewSubView( view, @@ -161,7 +164,7 @@ namespace alpaka //! \param view The view this view is a sub-view of. template< typename TView> - ViewSubView( + explicit ViewSubView( TView & view) : ViewSubView( view, @@ -172,7 +175,7 @@ namespace alpaka } public: - mem::view::ViewPlainPtr m_viewParentView; // This wraps the parent view. + mem::view::ViewPlainPtr m_viewParentView; // This wraps the parent view. vec::Vec m_extentElements; // The extent of this view. vec::Vec m_offsetsElements; // The offset relative to the parent view. 
}; @@ -195,7 +198,7 @@ namespace alpaka struct DevType< mem::view::ViewSubView> { - using type = TDev; + using type = alpaka::dev::Dev; }; //############################################################################# @@ -211,7 +214,7 @@ namespace alpaka //----------------------------------------------------------------------------- ALPAKA_FN_HOST static auto getDev( mem::view::ViewSubView const & view) - -> TDev + -> alpaka::dev::Dev { return dev::getDev( diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/Apply.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/Apply.hpp index c7d4e7908a..e7cc3e6068 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/meta/Apply.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/meta/Apply.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/ApplyTuple.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/ApplyTuple.hpp index f05bf12499..4dcb24dde3 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/meta/ApplyTuple.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/meta/ApplyTuple.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include #include #include diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/CartesianProduct.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/CartesianProduct.hpp index c9ae52bb86..70e20d0b11 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/meta/CartesianProduct.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/meta/CartesianProduct.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/Concatenate.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/Concatenate.hpp index 2a10a52fef..30068ca8e4 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/meta/Concatenate.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/meta/Concatenate.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/DependentFalseType.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/DependentFalseType.hpp index 4b0d15c5b6..c2b52ad944 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/meta/DependentFalseType.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/meta/DependentFalseType.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/Filter.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/Filter.hpp index aa9c67c1e1..806e4476dd 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/meta/Filter.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/meta/Filter.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/Fold.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/Fold.hpp index 53c3a94e04..942b83bee8 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/meta/Fold.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/meta/Fold.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/ForEachType.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/ForEachType.hpp index 8a344f161b..d365a22c18 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/meta/ForEachType.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/meta/ForEachType.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -9,7 +9,6 @@ #pragma once -#include #include #include @@ -63,14 +62,8 @@ namespace alpaka TArgs && ... args) -> void { - // Call the function object template call operator. 
-#if BOOST_COMP_MSVC && !BOOST_COMP_NVCC - f.operator()( - std::forward(args)...); -#else f.template operator()( std::forward(args)...); -#endif ForEachTypeHelper< TList> ::forEachTypeHelper( diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/Functional.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/Functional.hpp index 00d99c0d83..a051a938f6 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/meta/Functional.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/meta/Functional.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/InheritFromList.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/InheritFromList.hpp index 196c2fc510..10707260b5 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/meta/InheritFromList.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/meta/InheritFromList.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/IntegerSequence.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/IntegerSequence.hpp index eafdb10bae..1253973e10 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/meta/IntegerSequence.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/meta/IntegerSequence.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/Integral.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/Integral.hpp index 1bd095e216..55b0d2758c 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/meta/Integral.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/meta/Integral.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/IsStrictBase.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/IsStrictBase.hpp index c1817c80a7..fd550fea92 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/meta/IsStrictBase.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/meta/IsStrictBase.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/Metafunctions.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/Metafunctions.hpp index 8bc184c517..4cf1d2f928 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/meta/Metafunctions.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/meta/Metafunctions.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/NdLoop.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/NdLoop.hpp index 86d171c730..c918f922d4 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/meta/NdLoop.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/meta/NdLoop.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/Set.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/Set.hpp index aeb2aea6ac..c022d4c8d7 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/meta/Set.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/meta/Set.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/Transform.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/Transform.hpp index 22c2805fe9..18c0dc9485 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/meta/Transform.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/meta/Transform.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/Unique.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/Unique.hpp index dbb6adc43a..9d869fc47f 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/meta/Unique.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/meta/Unique.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/offset/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/offset/Traits.hpp index e1f658e5cc..df438841f6 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/offset/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/offset/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfCpu.hpp b/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfCpu.hpp index 61c2328762..9f99af429e 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfCpu.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfCpu.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfUniformCudaHipRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfUniformCudaHipRt.hpp index abb9392f9a..342993a68b 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfUniformCudaHipRt.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfUniformCudaHipRt.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -40,7 +40,7 @@ namespace alpaka namespace pltf { //############################################################################# - //! The CUDA RT device manager. + //! The CUDA/HIP RT platform. class PltfUniformCudaHipRt : public concepts::Implements { @@ -55,7 +55,7 @@ namespace alpaka namespace traits { //############################################################################# - //! The CUDA RT device manager device type trait specialization. + //! The CUDA/HIP RT platform device type trait specialization. template<> struct DevType< pltf::PltfUniformCudaHipRt> @@ -69,7 +69,7 @@ namespace alpaka namespace traits { //############################################################################# - //! The CPU platform device count get trait specialization. + //! The CUDA/HIP RT platform device count get trait specialization. template<> struct GetDevCount< pltf::PltfUniformCudaHipRt> @@ -90,7 +90,7 @@ namespace alpaka }; //############################################################################# - //! The CPU platform device get trait specialization. + //! The CUDA/HIP RT platform device get trait specialization. 
template<> struct GetDevByIdx< pltf::PltfUniformCudaHipRt> diff --git a/thirdParty/cupla/alpaka/include/alpaka/pltf/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/pltf/Traits.hpp index 9d7c27edda..b5dd3468eb 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/pltf/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/pltf/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -10,14 +10,11 @@ #pragma once #include -#include #include #include #include -#include - #include #include @@ -40,6 +37,16 @@ namespace alpaka typename TSfinae = void> struct PltfType; + template< + typename TPltf> + struct PltfType< + TPltf, + typename std::enable_if::value>::type + > + { + using type = typename concepts::ImplementationBase; + }; + //############################################################################# //! The device count get trait. 
template< @@ -69,7 +76,7 @@ namespace alpaka { return traits::GetDevCount< - TPltf> + Pltf> ::getDevCount(); } @@ -82,7 +89,7 @@ namespace alpaka { return traits::GetDevByIdx< - TPltf> + Pltf> ::getDevByIdx( devIdx); } @@ -92,19 +99,20 @@ namespace alpaka template< typename TPltf> ALPAKA_FN_HOST auto getDevs() - -> std::vector> + -> std::vector>> { - std::vector> devs; + std::vector>> devs; - std::size_t const devCount(getDevCount()); + std::size_t const devCount(getDevCount>()); for(std::size_t devIdx(0); devIdx < devCount; ++devIdx) { - devs.push_back(getDevByIdx(devIdx)); + devs.push_back(getDevByIdx>(devIdx)); } return devs; } } + namespace queue { namespace traits diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/Properties.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/Properties.hpp index 67da1cac30..455536e723 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/queue/Properties.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/queue/Properties.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Rene Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -19,11 +19,11 @@ namespace alpaka { //############################################################################# //! The caller is waiting until the enqueued task is finished - struct Blocking{}; + struct Blocking; //############################################################################# //! 
The caller is NOT waiting until the enqueued task is finished - struct NonBlocking{}; + struct NonBlocking; } using namespace property; diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCpuBlocking.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCpuBlocking.hpp index fe02591d7f..6e624bd275 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCpuBlocking.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCpuBlocking.hpp @@ -1,6 +1,6 @@ -/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner +/* Copyright 2020 Jeffrey Kelling * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -9,232 +9,13 @@ #pragma once -#include -#include -#include - -#include -#include -#include -#include - -#include -#include - -namespace alpaka -{ - namespace event - { - class EventCpu; - } -} +#include +#include namespace alpaka { namespace queue { - namespace cpu - { - namespace detail - { -#if BOOST_COMP_CLANG - // avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every translation unit [-Werror,-Wweak-vtables]" - // https://stackoverflow.com/a/29288300 - #pragma clang diagnostic push - #pragma clang diagnostic ignored "-Wweak-vtables" -#endif - //############################################################################# - //! The CPU device queue implementation. 
- class QueueCpuBlockingImpl final : public cpu::ICpuQueue -#if BOOST_COMP_CLANG - #pragma clang diagnostic pop -#endif - { - public: - //----------------------------------------------------------------------------- - QueueCpuBlockingImpl( - dev::DevCpu const & dev) noexcept : - m_dev(dev), - m_bCurrentlyExecutingTask(false) - {} - //----------------------------------------------------------------------------- - QueueCpuBlockingImpl(QueueCpuBlockingImpl const &) = delete; - //----------------------------------------------------------------------------- - QueueCpuBlockingImpl(QueueCpuBlockingImpl &&) = delete; - //----------------------------------------------------------------------------- - auto operator=(QueueCpuBlockingImpl const &) -> QueueCpuBlockingImpl & = delete; - //----------------------------------------------------------------------------- - auto operator=(QueueCpuBlockingImpl &&) -> QueueCpuBlockingImpl & = delete; - - //----------------------------------------------------------------------------- - void enqueue(event::EventCpu & ev) final - { - queue::enqueue(*this, ev); - } - - //----------------------------------------------------------------------------- - void wait(event::EventCpu const & ev) final - { - wait::wait(*this, ev); - } - - public: - dev::DevCpu const m_dev; //!< The device this queue is bound to. - std::mutex mutable m_mutex; - std::atomic m_bCurrentlyExecutingTask; - }; - } - } - - //############################################################################# - //! The CPU device queue. 
- class QueueCpuBlocking final : public concepts::Implements - { - public: - //----------------------------------------------------------------------------- - QueueCpuBlocking( - dev::DevCpu const & dev) : - m_spQueueImpl(std::make_shared(dev)) - { - dev.m_spDevCpuImpl->RegisterQueue(m_spQueueImpl); - } - //----------------------------------------------------------------------------- - QueueCpuBlocking(QueueCpuBlocking const &) = default; - //----------------------------------------------------------------------------- - QueueCpuBlocking(QueueCpuBlocking &&) = default; - //----------------------------------------------------------------------------- - auto operator=(QueueCpuBlocking const &) -> QueueCpuBlocking & = default; - //----------------------------------------------------------------------------- - auto operator=(QueueCpuBlocking &&) -> QueueCpuBlocking & = default; - //----------------------------------------------------------------------------- - auto operator==(QueueCpuBlocking const & rhs) const - -> bool - { - return (m_spQueueImpl == rhs.m_spQueueImpl); - } - //----------------------------------------------------------------------------- - auto operator!=(QueueCpuBlocking const & rhs) const - -> bool - { - return !((*this) == rhs); - } - //----------------------------------------------------------------------------- - ~QueueCpuBlocking() = default; - - public: - std::shared_ptr m_spQueueImpl; - }; - } - - namespace dev - { - namespace traits - { - //############################################################################# - //! The CPU blocking device queue device type trait specialization. - template<> - struct DevType< - queue::QueueCpuBlocking> - { - using type = dev::DevCpu; - }; - //############################################################################# - //! The CPU blocking device queue device get trait specialization. 
- template<> - struct GetDev< - queue::QueueCpuBlocking> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto getDev( - queue::QueueCpuBlocking const & queue) - -> dev::DevCpu - { - return queue.m_spQueueImpl->m_dev; - } - }; - } - } - namespace event - { - namespace traits - { - //############################################################################# - //! The CPU blocking device queue event type trait specialization. - template<> - struct EventType< - queue::QueueCpuBlocking> - { - using type = event::EventCpu; - }; - } - } - namespace queue - { - namespace traits - { - //############################################################################# - //! The CPU blocking device queue enqueue trait specialization. - //! This default implementation for all tasks directly invokes the function call operator of the task. - template< - typename TTask> - struct Enqueue< - queue::QueueCpuBlocking, - TTask> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto enqueue( - queue::QueueCpuBlocking & queue, - TTask const & task) - -> void - { - std::lock_guard lk(queue.m_spQueueImpl->m_mutex); - - queue.m_spQueueImpl->m_bCurrentlyExecutingTask = true; - - task(); - - queue.m_spQueueImpl->m_bCurrentlyExecutingTask = false; - } - }; - //############################################################################# - //! The CPU blocking device queue test trait specialization. - template<> - struct Empty< - queue::QueueCpuBlocking> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto empty( - queue::QueueCpuBlocking const & queue) - -> bool - { - return !queue.m_spQueueImpl->m_bCurrentlyExecutingTask; - } - }; - } - } - - namespace wait - { - namespace traits - { - //############################################################################# - //! 
The CPU blocking device queue thread wait trait specialization. - //! - //! Blocks execution of the calling thread until the queue has finished processing all previously requested tasks (kernels, data copies, ...) - template<> - struct CurrentThreadWaitFor< - queue::QueueCpuBlocking> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto currentThreadWaitFor( - queue::QueueCpuBlocking const & queue) - -> void - { - std::lock_guard lk(queue.m_spQueueImpl->m_mutex); - } - }; - } + using QueueCpuBlocking = QueueGenericThreadsBlocking; } } - -#include diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCpuNonBlocking.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCpuNonBlocking.hpp index 94030569d1..3aea5c93d5 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCpuNonBlocking.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCpuNonBlocking.hpp @@ -1,6 +1,6 @@ -/* Copyright 2019 Benjamin Worpitz, Matthias Werner +/* Copyright 2020 Jeffrey Kelling * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -9,226 +9,13 @@ #pragma once -#include -#include - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include - -namespace alpaka -{ - namespace event - { - class EventCpu; - } -} +#include +#include namespace alpaka { namespace queue { - namespace cpu - { - namespace detail - { -#if BOOST_COMP_CLANG - // avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every translation unit [-Werror,-Wweak-vtables]" - // https://stackoverflow.com/a/29288300 - #pragma clang diagnostic push - #pragma clang diagnostic ignored "-Wweak-vtables" -#endif - //############################################################################# - //! 
The CPU device queue implementation. - class QueueCpuNonBlockingImpl final : public cpu::ICpuQueue -#if BOOST_COMP_CLANG - #pragma clang diagnostic pop -#endif - { - private: - //############################################################################# - using ThreadPool = alpaka::core::detail::ConcurrentExecPool< - std::size_t, - std::thread, // The concurrent execution type. - std::promise, // The promise type. - void, // The type yielding the current concurrent execution. - std::mutex, // The mutex type to use. Only required if TisYielding is true. - std::condition_variable, // The condition variable type to use. Only required if TisYielding is true. - false>; // If the threads should yield. - - public: - //----------------------------------------------------------------------------- - QueueCpuNonBlockingImpl( - dev::DevCpu const & dev) : - m_dev(dev), - m_workerThread(1u) - {} - //----------------------------------------------------------------------------- - QueueCpuNonBlockingImpl(QueueCpuNonBlockingImpl const &) = delete; - //----------------------------------------------------------------------------- - QueueCpuNonBlockingImpl(QueueCpuNonBlockingImpl &&) = delete; - //----------------------------------------------------------------------------- - auto operator=(QueueCpuNonBlockingImpl const &) -> QueueCpuNonBlockingImpl & = delete; - //----------------------------------------------------------------------------- - auto operator=(QueueCpuNonBlockingImpl &&) -> QueueCpuNonBlockingImpl & = delete; - - //----------------------------------------------------------------------------- - void enqueue(event::EventCpu & ev) final - { - queue::enqueue(*this, ev); - } - - //----------------------------------------------------------------------------- - void wait(event::EventCpu const & ev) final - { - wait::wait(*this, ev); - } - - public: - dev::DevCpu const m_dev; //!< The device this queue is bound to. 
- - ThreadPool m_workerThread; - }; - } - } - - //############################################################################# - //! The CPU device queue. - class QueueCpuNonBlocking final : public concepts::Implements - { - public: - //----------------------------------------------------------------------------- - QueueCpuNonBlocking( - dev::DevCpu const & dev) : - m_spQueueImpl(std::make_shared(dev)) - { - dev.m_spDevCpuImpl->RegisterQueue(m_spQueueImpl); - } - //----------------------------------------------------------------------------- - QueueCpuNonBlocking(QueueCpuNonBlocking const &) = default; - //----------------------------------------------------------------------------- - QueueCpuNonBlocking(QueueCpuNonBlocking &&) = default; - //----------------------------------------------------------------------------- - auto operator=(QueueCpuNonBlocking const &) -> QueueCpuNonBlocking & = default; - //----------------------------------------------------------------------------- - auto operator=(QueueCpuNonBlocking &&) -> QueueCpuNonBlocking & = default; - //----------------------------------------------------------------------------- - auto operator==(QueueCpuNonBlocking const & rhs) const - -> bool - { - return (m_spQueueImpl == rhs.m_spQueueImpl); - } - //----------------------------------------------------------------------------- - auto operator!=(QueueCpuNonBlocking const & rhs) const - -> bool - { - return !((*this) == rhs); - } - //----------------------------------------------------------------------------- - ~QueueCpuNonBlocking() = default; - - public: - std::shared_ptr m_spQueueImpl; - }; - } - - namespace dev - { - namespace traits - { - //############################################################################# - //! The CPU non-blocking device queue device type trait specialization. 
- template<> - struct DevType< - queue::QueueCpuNonBlocking> - { - using type = dev::DevCpu; - }; - //############################################################################# - //! The CPU non-blocking device queue device get trait specialization. - template<> - struct GetDev< - queue::QueueCpuNonBlocking> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto getDev( - queue::QueueCpuNonBlocking const & queue) - -> dev::DevCpu - { - return queue.m_spQueueImpl->m_dev; - } - }; - } - } - namespace event - { - namespace traits - { - //############################################################################# - //! The CPU non-blocking device queue event type trait specialization. - template<> - struct EventType< - queue::QueueCpuNonBlocking> - { - using type = event::EventCpu; - }; - } - } - namespace queue - { - namespace traits - { - //############################################################################# - //! The CPU non-blocking device queue enqueue trait specialization. - //! This default implementation for all tasks directly invokes the function call operator of the task. - template< - typename TTask> - struct Enqueue< - queue::QueueCpuNonBlocking, - TTask> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto enqueue( -#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX) - queue::QueueCpuNonBlocking & queue, - TTask const & task) -#else - queue::QueueCpuNonBlocking &, - TTask const &) -#endif - -> void - { -// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp. -#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX) - queue.m_spQueueImpl->m_workerThread.enqueueTask( - task); -#endif - } - }; - //############################################################################# - //! The CPU non-blocking device queue test trait specialization. 
- template<> - struct Empty< - queue::QueueCpuNonBlocking> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto empty( - queue::QueueCpuNonBlocking const & queue) - -> bool - { - return queue.m_spQueueImpl->m_workerThread.isIdle(); - } - }; - } + using QueueCpuNonBlocking = QueueGenericThreadsNonBlocking; } } - -#include diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueGenericThreadsBlocking.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueGenericThreadsBlocking.hpp new file mode 100644 index 0000000000..8511ffc667 --- /dev/null +++ b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueGenericThreadsBlocking.hpp @@ -0,0 +1,257 @@ +/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace alpaka +{ + namespace event + { + template + class EventGenericThreads; + } +} + +namespace alpaka +{ + namespace queue + { + namespace generic + { + namespace detail + { +#if BOOST_COMP_CLANG + // avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every translation unit [-Werror,-Wweak-vtables]" + // https://stackoverflow.com/a/29288300 + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wweak-vtables" +#endif + //############################################################################# + //! The CPU device queue implementation. 
+ template< + typename TDev> + class QueueGenericThreadsBlockingImpl final : public IGenericThreadsQueue +#if BOOST_COMP_CLANG + #pragma clang diagnostic pop +#endif + { + public: + //----------------------------------------------------------------------------- + explicit QueueGenericThreadsBlockingImpl( + TDev const & dev) noexcept : + m_dev(dev), + m_bCurrentlyExecutingTask(false) + {} + //----------------------------------------------------------------------------- + QueueGenericThreadsBlockingImpl(QueueGenericThreadsBlockingImpl const &) = delete; + //----------------------------------------------------------------------------- + QueueGenericThreadsBlockingImpl(QueueGenericThreadsBlockingImpl &&) = delete; + //----------------------------------------------------------------------------- + auto operator=(QueueGenericThreadsBlockingImpl const &) -> QueueGenericThreadsBlockingImpl & = delete; + //----------------------------------------------------------------------------- + auto operator=(QueueGenericThreadsBlockingImpl &&) -> QueueGenericThreadsBlockingImpl & = delete; + + //----------------------------------------------------------------------------- + void enqueue(event::EventGenericThreads & ev) final + { + queue::enqueue(*this, ev); + } + + //----------------------------------------------------------------------------- + void wait(event::EventGenericThreads const & ev) final + { + wait::wait(*this, ev); + } + + public: + TDev const m_dev; //!< The device this queue is bound to. + std::mutex mutable m_mutex; + std::atomic m_bCurrentlyExecutingTask; + }; + } + } + + //############################################################################# + //! The CPU device queue. 
+ template< + typename TDev> + class QueueGenericThreadsBlocking final + : public concepts::Implements> + , public concepts::Implements> + , public concepts::Implements> + { + public: + //----------------------------------------------------------------------------- + explicit QueueGenericThreadsBlocking( + TDev const & dev) : + m_spQueueImpl(std::make_shared>(dev)) + { + ALPAKA_DEBUG_FULL_LOG_SCOPE; + + dev.registerQueue(m_spQueueImpl); + } + //----------------------------------------------------------------------------- + QueueGenericThreadsBlocking(QueueGenericThreadsBlocking const &) = default; + //----------------------------------------------------------------------------- + QueueGenericThreadsBlocking(QueueGenericThreadsBlocking &&) = default; + //----------------------------------------------------------------------------- + auto operator=(QueueGenericThreadsBlocking const &) -> QueueGenericThreadsBlocking & = default; + //----------------------------------------------------------------------------- + auto operator=(QueueGenericThreadsBlocking &&) -> QueueGenericThreadsBlocking & = default; + //----------------------------------------------------------------------------- + auto operator==(QueueGenericThreadsBlocking const & rhs) const + -> bool + { + return (m_spQueueImpl == rhs.m_spQueueImpl); + } + //----------------------------------------------------------------------------- + auto operator!=(QueueGenericThreadsBlocking const & rhs) const + -> bool + { + return !((*this) == rhs); + } + //----------------------------------------------------------------------------- + ~QueueGenericThreadsBlocking() = default; + + public: + std::shared_ptr> m_spQueueImpl; + }; + } + + namespace dev + { + namespace traits + { + //############################################################################# + //! The CPU blocking device queue device type trait specialization. 
+ template< + typename TDev> + struct DevType< + queue::QueueGenericThreadsBlocking> + { + using type = TDev; + }; + //############################################################################# + //! The CPU blocking device queue device get trait specialization. + template< + typename TDev> + struct GetDev< + queue::QueueGenericThreadsBlocking> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto getDev( + queue::QueueGenericThreadsBlocking const & queue) + -> TDev + { + return queue.m_spQueueImpl->m_dev; + } + }; + } + } + namespace event + { + namespace traits + { + //############################################################################# + //! The CPU blocking device queue event type trait specialization. + template< + typename TDev> + struct EventType< + queue::QueueGenericThreadsBlocking> + { + using type = event::EventGenericThreads; + }; + } + } + namespace queue + { + namespace traits + { + //############################################################################# + //! The CPU blocking device queue enqueue trait specialization. + //! This default implementation for all tasks directly invokes the function call operator of the task. + template< + typename TDev, + typename TTask> + struct Enqueue< + queue::QueueGenericThreadsBlocking, + TTask> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto enqueue( + queue::QueueGenericThreadsBlocking & queue, + TTask const & task) + -> void + { + std::lock_guard lk(queue.m_spQueueImpl->m_mutex); + + queue.m_spQueueImpl->m_bCurrentlyExecutingTask = true; + + task(); + + queue.m_spQueueImpl->m_bCurrentlyExecutingTask = false; + } + }; + //############################################################################# + //! The CPU blocking device queue test trait specialization. 
+ template< + typename TDev> + struct Empty< + queue::QueueGenericThreadsBlocking> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto empty( + queue::QueueGenericThreadsBlocking const & queue) + -> bool + { + return !queue.m_spQueueImpl->m_bCurrentlyExecutingTask; + } + }; + } + } + + namespace wait + { + namespace traits + { + //############################################################################# + //! The CPU blocking device queue thread wait trait specialization. + //! + //! Blocks execution of the calling thread until the queue has finished processing all previously requested tasks (kernels, data copies, ...) + template< + typename TDev> + struct CurrentThreadWaitFor< + queue::QueueGenericThreadsBlocking> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto currentThreadWaitFor( + queue::QueueGenericThreadsBlocking const & queue) + -> void + { + std::lock_guard lk(queue.m_spQueueImpl->m_mutex); + } + }; + } + } +} + +#include diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueGenericThreadsNonBlocking.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueGenericThreadsNonBlocking.hpp new file mode 100644 index 0000000000..c08d87d269 --- /dev/null +++ b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueGenericThreadsNonBlocking.hpp @@ -0,0 +1,246 @@ +/* Copyright 2019 Benjamin Worpitz, Matthias Werner + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +namespace alpaka +{ + namespace event + { + template + class EventGenericThreads; + } +} + +namespace alpaka +{ + namespace queue + { + namespace generic + { + namespace detail + { +#if BOOST_COMP_CLANG + // avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every translation unit [-Werror,-Wweak-vtables]" + // https://stackoverflow.com/a/29288300 + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wweak-vtables" +#endif + //############################################################################# + //! The CPU device queue implementation. + template< + typename TDev> + class QueueGenericThreadsNonBlockingImpl final : public IGenericThreadsQueue +#if BOOST_COMP_CLANG + #pragma clang diagnostic pop +#endif + { + private: + //############################################################################# + using ThreadPool = alpaka::core::detail::ConcurrentExecPool< + std::size_t, + std::thread, // The concurrent execution type. + std::promise, // The promise type. + void, // The type yielding the current concurrent execution. + std::mutex, // The mutex type to use. Only required if TisYielding is true. + std::condition_variable, // The condition variable type to use. Only required if TisYielding is true. + false>; // If the threads should yield. 
+ + public: + //----------------------------------------------------------------------------- + explicit QueueGenericThreadsNonBlockingImpl( + TDev const & dev) : + m_dev(dev), + m_workerThread(1u) + {} + //----------------------------------------------------------------------------- + QueueGenericThreadsNonBlockingImpl(QueueGenericThreadsNonBlockingImpl const &) = delete; + //----------------------------------------------------------------------------- + QueueGenericThreadsNonBlockingImpl(QueueGenericThreadsNonBlockingImpl &&) = delete; + //----------------------------------------------------------------------------- + auto operator=(QueueGenericThreadsNonBlockingImpl const &) -> QueueGenericThreadsNonBlockingImpl & = delete; + //----------------------------------------------------------------------------- + auto operator=(QueueGenericThreadsNonBlockingImpl &&) -> QueueGenericThreadsNonBlockingImpl & = delete; + + //----------------------------------------------------------------------------- + void enqueue(event::EventGenericThreads & ev) final + { + queue::enqueue(*this, ev); + } + + //----------------------------------------------------------------------------- + void wait(event::EventGenericThreads const & ev) final + { + wait::wait(*this, ev); + } + + public: + TDev const m_dev; //!< The device this queue is bound to. + + ThreadPool m_workerThread; + }; + } + } + + //############################################################################# + //! The CPU device queue. 
+ template< + typename TDev> + class QueueGenericThreadsNonBlocking final + : public concepts::Implements> + , public concepts::Implements> + , public concepts::Implements> + { + public: + //----------------------------------------------------------------------------- + explicit QueueGenericThreadsNonBlocking( + TDev const & dev) : + m_spQueueImpl(std::make_shared>(dev)) + { + ALPAKA_DEBUG_FULL_LOG_SCOPE; + + dev.registerQueue(m_spQueueImpl); + } + //----------------------------------------------------------------------------- + QueueGenericThreadsNonBlocking(QueueGenericThreadsNonBlocking const &) = default; + //----------------------------------------------------------------------------- + QueueGenericThreadsNonBlocking(QueueGenericThreadsNonBlocking &&) = default; + //----------------------------------------------------------------------------- + auto operator=(QueueGenericThreadsNonBlocking const &) -> QueueGenericThreadsNonBlocking & = default; + //----------------------------------------------------------------------------- + auto operator=(QueueGenericThreadsNonBlocking &&) -> QueueGenericThreadsNonBlocking & = default; + //----------------------------------------------------------------------------- + auto operator==(QueueGenericThreadsNonBlocking const & rhs) const + -> bool + { + return (m_spQueueImpl == rhs.m_spQueueImpl); + } + //----------------------------------------------------------------------------- + auto operator!=(QueueGenericThreadsNonBlocking const & rhs) const + -> bool + { + return !((*this) == rhs); + } + //----------------------------------------------------------------------------- + ~QueueGenericThreadsNonBlocking() = default; + + public: + std::shared_ptr> m_spQueueImpl; + }; + } + + namespace dev + { + namespace traits + { + //############################################################################# + //! The CPU non-blocking device queue device type trait specialization. 
+ template< + typename TDev> + struct DevType< + queue::QueueGenericThreadsNonBlocking> + { + using type = TDev; + }; + //############################################################################# + //! The CPU non-blocking device queue device get trait specialization. + template< + typename TDev> + struct GetDev< + queue::QueueGenericThreadsNonBlocking> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto getDev( + queue::QueueGenericThreadsNonBlocking const & queue) + -> TDev + { + return queue.m_spQueueImpl->m_dev; + } + }; + } + } + namespace event + { + namespace traits + { + //############################################################################# + //! The CPU non-blocking device queue event type trait specialization. + template< + typename TDev> + struct EventType< + queue::QueueGenericThreadsNonBlocking> + { + using type = event::EventGenericThreads; + }; + } + } + namespace queue + { + namespace traits + { + //############################################################################# + //! The CPU non-blocking device queue enqueue trait specialization. + //! This default implementation for all tasks directly invokes the function call operator of the task. + template< + typename TDev, + typename TTask> + struct Enqueue< + queue::QueueGenericThreadsNonBlocking, + TTask> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto enqueue( + queue::QueueGenericThreadsNonBlocking & queue, + TTask const & task) + -> void + { +// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp. +#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX) + queue.m_spQueueImpl->m_workerThread.enqueueTask( + task); +#else + alpaka::ignore_unused(queue); + alpaka::ignore_unused(task); +#endif + } + }; + //############################################################################# + //! 
The CPU non-blocking device queue test trait specialization. + template< + typename TDev> + struct Empty< + queue::QueueGenericThreadsNonBlocking> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto empty( + queue::QueueGenericThreadsNonBlocking const & queue) + -> bool + { + return queue.m_spQueueImpl->m_workerThread.isIdle(); + } + }; + } + } +} + +#include diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueUniformCudaHipRtBlocking.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueUniformCudaHipRtBlocking.hpp index a1140bb544..b4ee1ff037 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueUniformCudaHipRtBlocking.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueUniformCudaHipRtBlocking.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -21,12 +21,14 @@ #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP! #endif -#include +#include +#include +#include #include #include #include -#include +#include // Backend specific includes. #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) @@ -54,88 +56,16 @@ namespace alpaka { namespace queue { - namespace uniform_cuda_hip - { - namespace detail - { - //############################################################################# - //! The CUDA/HIP RT blocking queue implementation. - class QueueUniformCudaHipRtBlockingImpl final - { - public: - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST QueueUniformCudaHipRtBlockingImpl( - dev::DevUniformCudaHipRt const & dev) : - m_dev(dev), - m_UniformCudaHipQueue() - { - ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - - // Set the current device. 
- ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(SetDevice)( - m_dev.m_iDevice)); - - // - [cuda/hip]StreamDefault: Default queue creation flag. - // - [cuda/hip]StreamNonBlocking: Specifies that work running in the created queue may run concurrently with work in queue 0 (the NULL queue), - // and that the created queue should perform no implicit synchronization with queue 0. - // Create the queue on the current device. - // NOTE: [cuda/hip]StreamNonBlocking is required to match the semantic implemented in the alpaka CPU queue. - // It would be too much work to implement implicit default queue synchronization on CPU. - - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(StreamCreateWithFlags)( - &m_UniformCudaHipQueue, - ALPAKA_API_PREFIX(StreamNonBlocking))); - - } - //----------------------------------------------------------------------------- - QueueUniformCudaHipRtBlockingImpl(QueueUniformCudaHipRtBlockingImpl const &) = delete; - //----------------------------------------------------------------------------- - QueueUniformCudaHipRtBlockingImpl(QueueUniformCudaHipRtBlockingImpl &&) = default; - //----------------------------------------------------------------------------- - auto operator=(QueueUniformCudaHipRtBlockingImpl const &) -> QueueUniformCudaHipRtBlockingImpl & = delete; - //----------------------------------------------------------------------------- - auto operator=(QueueUniformCudaHipRtBlockingImpl &&) -> QueueUniformCudaHipRtBlockingImpl & = delete; - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST ~QueueUniformCudaHipRtBlockingImpl() - { - ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - - // Set the current device. \TODO: Is setting the current device before [cuda/hip]StreamDestroy required? 
- - // In case the device is still doing work in the queue when [cuda/hip]StreamDestroy() is called, the function will return immediately - // and the resources associated with queue will be released automatically once the device has completed all work in queue. - // -> No need to synchronize here. - - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(SetDevice)( - m_dev.m_iDevice)); - // In case the device is still doing work in the queue when cuda/hip-StreamDestroy() is called, the function will return immediately - // and the resources associated with queue will be released automatically once the device has completed all work in queue. - // -> No need to synchronize here. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(StreamDestroy)( - m_UniformCudaHipQueue)); - - } - - public: - dev::DevUniformCudaHipRt const m_dev; //!< The device this queue is bound to. - ALPAKA_API_PREFIX(Stream_t) m_UniformCudaHipQueue; - }; - } - } //############################################################################# //! The CUDA/HIP RT blocking queue. 
- class QueueUniformCudaHipRtBlocking final : public concepts::Implements + class QueueUniformCudaHipRtBlocking final : public uniform_cuda_hip::detail::QueueUniformCudaHipRtBase { public: //----------------------------------------------------------------------------- ALPAKA_FN_HOST QueueUniformCudaHipRtBlocking( dev::DevUniformCudaHipRt const & dev) : - m_spQueueImpl(std::make_shared(dev)) + uniform_cuda_hip::detail::QueueUniformCudaHipRtBase(dev) {} //----------------------------------------------------------------------------- QueueUniformCudaHipRtBlocking(QueueUniformCudaHipRtBlocking const &) = default; @@ -159,9 +89,6 @@ namespace alpaka } //----------------------------------------------------------------------------- ~QueueUniformCudaHipRtBlocking() = default; - - public: - std::shared_ptr m_spQueueImpl; }; #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) @@ -183,20 +110,6 @@ namespace alpaka { using type = dev::DevUniformCudaHipRt; }; - //############################################################################# - //! The CUDA/HIP RT blocking queue device get trait specialization. - template<> - struct GetDev< - queue::QueueUniformCudaHipRtBlocking> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto getDev( - queue::QueueUniformCudaHipRtBlocking const & queue) - -> dev::DevUniformCudaHipRt - { - return queue.m_spQueueImpl->m_dev; - } - }; } } namespace event @@ -318,53 +231,6 @@ namespace alpaka t.join(); } }; - //############################################################################# - //! The CUDA/HIP RT blocking queue test trait specialization. 
- template<> - struct Empty< - queue::QueueUniformCudaHipRtBlocking> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto empty( - queue::QueueUniformCudaHipRtBlocking const & queue) - -> bool - { - ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - - // Query is allowed even for queues on non current device. - ALPAKA_API_PREFIX(Error_t) ret = ALPAKA_API_PREFIX(Success); - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE( - ret = ALPAKA_API_PREFIX(StreamQuery)(queue.m_spQueueImpl->m_UniformCudaHipQueue), - ALPAKA_API_PREFIX(ErrorNotReady)); - return (ret == ALPAKA_API_PREFIX(Success)); - } - }; - } - } - namespace wait - { - namespace traits - { - //############################################################################# - //! The CUDA/HIP RT blocking queue thread wait trait specialization. - //! - //! Blocks execution of the calling thread until the queue has finished processing all previously requested tasks (kernels, data copies, ...) - template<> - struct CurrentThreadWaitFor< - queue::QueueUniformCudaHipRtBlocking> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto currentThreadWaitFor( - queue::QueueUniformCudaHipRtBlocking const & queue) - -> void - { - ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - - // Sync is allowed even for queues on non current device. 
- ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( ALPAKA_API_PREFIX(StreamSynchronize)( - queue.m_spQueueImpl->m_UniformCudaHipQueue)); - } - }; } } } diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp index cf482b3d43..1273850e8a 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -21,12 +21,13 @@ #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP! #endif -#include +#include +#include +#include #include #include #include -#include #include // Backend specific includes. @@ -55,84 +56,15 @@ namespace alpaka { namespace queue { - namespace uniform_cuda_hip - { - namespace detail - { - //############################################################################# - //! The CUDA/HIP RT non-blocking queue implementation. - class QueueUniformCudaHipRtNonBlockingImpl final - { - public: - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST QueueUniformCudaHipRtNonBlockingImpl( - dev::DevUniformCudaHipRt const & dev) : - m_dev(dev), - m_UniformCudaHipQueue() - { - ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - - - // - [cuda/hip]StreamDefault: Default queue creation flag. - // - [cuda/hip]StreamNonBlocking: Specifies that work running in the created queue may run concurrently with work in queue 0 (the NULL queue), - // and that the created queue should perform no implicit synchronization with queue 0. - // Create the queue on the current device. 
- // NOTE: [cuda/hip]StreamNonBlocking is required to match the semantic implemented in the alpaka CPU queue. - // It would be too much work to implement implicit default queue synchronization on CPU. - // Set the current device. - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(SetDevice)( - m_dev.m_iDevice)); - - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(StreamCreateWithFlags)( - &m_UniformCudaHipQueue, - ALPAKA_API_PREFIX(StreamNonBlocking))); - } - //----------------------------------------------------------------------------- - QueueUniformCudaHipRtNonBlockingImpl(QueueUniformCudaHipRtNonBlockingImpl const &) = delete; - //----------------------------------------------------------------------------- - QueueUniformCudaHipRtNonBlockingImpl(QueueUniformCudaHipRtNonBlockingImpl &&) = default; - //----------------------------------------------------------------------------- - auto operator=(QueueUniformCudaHipRtNonBlockingImpl const &) -> QueueUniformCudaHipRtNonBlockingImpl & = delete; - //----------------------------------------------------------------------------- - auto operator=(QueueUniformCudaHipRtNonBlockingImpl &&) -> QueueUniformCudaHipRtNonBlockingImpl & = delete; - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST ~QueueUniformCudaHipRtNonBlockingImpl() - { - ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - - // Set the current device. \TODO: Is setting the current device before cuda/hip-StreamDestroy required? - - // In case the device is still doing work in the queue when [cuda/hip]StreamDestroy() is called, the function will return immediately - // and the resources associated with queue will be released automatically once the device has completed all work in queue. - // -> No need to synchronize here. 
- - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(SetDevice)( - m_dev.m_iDevice)); - - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(StreamDestroy)( - m_UniformCudaHipQueue)); - } - - public: - dev::DevUniformCudaHipRt const m_dev; //!< The device this queue is bound to. - ALPAKA_API_PREFIX(Stream_t) m_UniformCudaHipQueue; - }; - } - } - //############################################################################# //! The CUDA/HIP RT non-blocking queue. - class QueueUniformCudaHipRtNonBlocking final : public concepts::Implements + class QueueUniformCudaHipRtNonBlocking final : public uniform_cuda_hip::detail::QueueUniformCudaHipRtBase { public: //----------------------------------------------------------------------------- ALPAKA_FN_HOST QueueUniformCudaHipRtNonBlocking( dev::DevUniformCudaHipRt const & dev) : - m_spQueueImpl(std::make_shared(dev)) + uniform_cuda_hip::detail::QueueUniformCudaHipRtBase(dev) {} //----------------------------------------------------------------------------- QueueUniformCudaHipRtNonBlocking(QueueUniformCudaHipRtNonBlocking const &) = default; @@ -156,9 +88,6 @@ namespace alpaka } //----------------------------------------------------------------------------- ~QueueUniformCudaHipRtNonBlocking() = default; - - public: - std::shared_ptr m_spQueueImpl; }; #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) @@ -180,20 +109,6 @@ namespace alpaka { using type = dev::DevUniformCudaHipRt; }; - //############################################################################# - //! The CUDA/HIP RT non-blocking queue device get trait specialization. 
- template<> - struct GetDev< - queue::QueueUniformCudaHipRtNonBlocking> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto getDev( - queue::QueueUniformCudaHipRtNonBlocking const & queue) - -> dev::DevUniformCudaHipRt - { - return queue.m_spQueueImpl->m_dev; - } - }; } } namespace event @@ -322,54 +237,6 @@ namespace alpaka t.detach(); } }; - //############################################################################# - //! The CUDA/HIP RT non-blocking queue test trait specialization. - template<> - struct Empty< - queue::QueueUniformCudaHipRtNonBlocking> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto empty( - queue::QueueUniformCudaHipRtNonBlocking const & queue) - -> bool - { - ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - - // Query is allowed even for queues on non current device. - ALPAKA_API_PREFIX(Error_t) ret = ALPAKA_API_PREFIX(Success); - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE( - ret = ALPAKA_API_PREFIX(StreamQuery)( - queue.m_spQueueImpl->m_UniformCudaHipQueue), - ALPAKA_API_PREFIX(ErrorNotReady)); - return (ret == ALPAKA_API_PREFIX(Success)); - } - }; - } - } - namespace wait - { - namespace traits - { - //############################################################################# - //! The CUDA/HIP RT non-blocking queue thread wait trait specialization. - //! - //! Blocks execution of the calling thread until the queue has finished processing all previously requested tasks (kernels, data copies, ...) - template<> - struct CurrentThreadWaitFor< - queue::QueueUniformCudaHipRtNonBlocking> - { - //----------------------------------------------------------------------------- - ALPAKA_FN_HOST static auto currentThreadWaitFor( - queue::QueueUniformCudaHipRtNonBlocking const & queue) - -> void - { - ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - - // Sync is allowed even for queues on non current device. 
- ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( ALPAKA_API_PREFIX(StreamSynchronize)( - queue.m_spQueueImpl->m_UniformCudaHipQueue)); - } - }; } } } diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/Traits.hpp index 5d1785c3ba..700b4a2db6 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/queue/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/queue/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -16,12 +16,16 @@ #include #include +#include + namespace alpaka { //----------------------------------------------------------------------------- //! The queue specifics. namespace queue { + struct ConceptQueue; + //----------------------------------------------------------------------------- //! The queue traits. namespace traits @@ -80,19 +84,20 @@ namespace alpaka TQueue const & queue) -> bool { + using ImplementationBase = concepts::ImplementationBase; return traits::Empty< - TQueue> + ImplementationBase> ::empty( queue); } //----------------------------------------------------------------------------- //! Queue based on the environment and a property - // - // \tparam TEnv Environment type, e.g. accelerator, device or a platform. - // queue::traits::QueueType must be specialized for TEnv - // \tparam TProperty Property to define the behavior of TEnv. + //! + //! \tparam TEnv Environment type, e.g. accelerator, device or a platform. + //! queue::traits::QueueType must be specialized for TEnv + //! \tparam TProperty Property to define the behavior of TEnv. 
template< typename TEnv, typename TProperty> diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/cpu/ICpuQueue.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/cpu/ICpuQueue.hpp index d13cc0f87e..56b249d2f9 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/queue/cpu/ICpuQueue.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/queue/cpu/ICpuQueue.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -9,15 +9,8 @@ #pragma once -#include - -namespace alpaka -{ - namespace event - { - class EventCpu; - } -} +#include +#include namespace alpaka { @@ -25,32 +18,9 @@ namespace alpaka { namespace cpu { - - -#if BOOST_COMP_CLANG - // avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every translation unit [-Werror,-Wweak-vtables]" - // https://stackoverflow.com/a/29288300 - #pragma clang diagnostic push - #pragma clang diagnostic ignored "-Wweak-vtables" -#endif - //############################################################################# //! The CPU queue interface - class ICpuQueue - { - public: - //----------------------------------------------------------------------------- - //! enqueue the event - virtual void enqueue(event::EventCpu &) = 0; - //----------------------------------------------------------------------------- - //! 
waiting for the event - virtual void wait(event::EventCpu const &) = 0; - //----------------------------------------------------------------------------- - virtual ~ICpuQueue() = default; - }; -#if BOOST_COMP_CLANG - #pragma clang diagnostic pop -#endif + using ICpuQueue = IGenericThreadsQueue; } } } diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/cpu/IGenericThreadsQueue.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/cpu/IGenericThreadsQueue.hpp new file mode 100644 index 0000000000..21d67f2472 --- /dev/null +++ b/thirdParty/cupla/alpaka/include/alpaka/queue/cpu/IGenericThreadsQueue.hpp @@ -0,0 +1,54 @@ +/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include + +namespace alpaka +{ + namespace event + { + template + class EventGenericThreads; + } +} + +namespace alpaka +{ + namespace queue + { +#if BOOST_COMP_CLANG + // avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every translation unit [-Werror,-Wweak-vtables]" + // https://stackoverflow.com/a/29288300 + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wweak-vtables" +#endif + + //############################################################################# + //! The CPU queue interface + template< + typename TDev> + class IGenericThreadsQueue + { + public: + //----------------------------------------------------------------------------- + //! enqueue the event + virtual void enqueue(event::EventGenericThreads &) = 0; + //----------------------------------------------------------------------------- + //! 
waiting for the event + virtual void wait(event::EventGenericThreads const &) = 0; + //----------------------------------------------------------------------------- + virtual ~IGenericThreadsQueue() = default; + }; +#if BOOST_COMP_CLANG + #pragma clang diagnostic pop +#endif + } +} diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/cuda_hip/QueueUniformCudaHipRtBase.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/cuda_hip/QueueUniformCudaHipRtBase.hpp new file mode 100644 index 0000000000..54a014a493 --- /dev/null +++ b/thirdParty/cupla/alpaka/include/alpaka/queue/cuda_hip/QueueUniformCudaHipRtBase.hpp @@ -0,0 +1,232 @@ +/* Copyright 2020 Benjamin Worpitz, Matthias Werner, René Widera + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + + +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED) + +#include + +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA +#error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA! +#endif + +#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP +#error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP! +#endif + +#include +#include +#include +#include + +// Backend specific includes. +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) +#include +#else +#include +#endif + +#include + +namespace alpaka +{ + namespace queue + { + namespace uniform_cuda_hip + { + namespace detail + { + //############################################################################# + //! The CUDA/HIP RT blocking queue implementation. 
+ class QueueUniformCudaHipRtImpl final + { + public: + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST QueueUniformCudaHipRtImpl( + dev::DevUniformCudaHipRt const & dev) : + m_dev(dev), + m_UniformCudaHipQueue() + { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + // Set the current device. + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( + m_dev.m_iDevice)); + + // - [cuda/hip]StreamDefault: Default queue creation flag. + // - [cuda/hip]StreamNonBlocking: Specifies that work running in the created queue may run concurrently with work in queue 0 (the NULL queue), + // and that the created queue should perform no implicit synchronization with queue 0. + // Create the queue on the current device. + // NOTE: [cuda/hip]StreamNonBlocking is required to match the semantic implemented in the alpaka CPU queue. + // It would be too much work to implement implicit default queue synchronization on CPU. + + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(StreamCreateWithFlags)( + &m_UniformCudaHipQueue, + ALPAKA_API_PREFIX(StreamNonBlocking))); + + } + //----------------------------------------------------------------------------- + QueueUniformCudaHipRtImpl(QueueUniformCudaHipRtImpl const &) = delete; + //----------------------------------------------------------------------------- + QueueUniformCudaHipRtImpl(QueueUniformCudaHipRtImpl &&) = default; + //----------------------------------------------------------------------------- + auto operator=(QueueUniformCudaHipRtImpl const &) -> QueueUniformCudaHipRtImpl & = delete; + //----------------------------------------------------------------------------- + auto operator=(QueueUniformCudaHipRtImpl &&) -> QueueUniformCudaHipRtImpl & = delete; + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST ~QueueUniformCudaHipRtImpl() + { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + // Set the current device. 
\TODO: Is setting the current device before [cuda/hip]StreamDestroy required? + + // In case the device is still doing work in the queue when [cuda/hip]StreamDestroy() is called, the function will return immediately + // and the resources associated with queue will be released automatically once the device has completed all work in queue. + // -> No need to synchronize here. + + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( + m_dev.m_iDevice)); + // In case the device is still doing work in the queue when cuda/hip-StreamDestroy() is called, the function will return immediately + // and the resources associated with queue will be released automatically once the device has completed all work in queue. + // -> No need to synchronize here. + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(StreamDestroy)( + m_UniformCudaHipQueue)); + + } + + public: + dev::DevUniformCudaHipRt const m_dev; //!< The device this queue is bound to. + ALPAKA_API_PREFIX(Stream_t) m_UniformCudaHipQueue; + }; + + //############################################################################# + //! The CUDA RT blocking queue. 
+ class QueueUniformCudaHipRtBase + : public concepts::Implements + , public concepts::Implements + , public concepts::Implements + { + public: + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST QueueUniformCudaHipRtBase( + dev::DevUniformCudaHipRt const & dev) : + m_spQueueImpl(std::make_shared(dev)) + {} + //----------------------------------------------------------------------------- + QueueUniformCudaHipRtBase(QueueUniformCudaHipRtBase const &) = default; + //----------------------------------------------------------------------------- + QueueUniformCudaHipRtBase(QueueUniformCudaHipRtBase &&) = default; + //----------------------------------------------------------------------------- + auto operator=(QueueUniformCudaHipRtBase const &) -> QueueUniformCudaHipRtBase & = default; + //----------------------------------------------------------------------------- + auto operator=(QueueUniformCudaHipRtBase &&) -> QueueUniformCudaHipRtBase & = default; + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST auto operator==(QueueUniformCudaHipRtBase const & rhs) const + -> bool + { + return (m_spQueueImpl == rhs.m_spQueueImpl); + } + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST auto operator!=(QueueUniformCudaHipRtBase const & rhs) const + -> bool + { + return !((*this) == rhs); + } + //----------------------------------------------------------------------------- + ~QueueUniformCudaHipRtBase() = default; + + public: + std::shared_ptr m_spQueueImpl; + }; + } + } + } + + namespace dev + { + namespace traits + { + //############################################################################# + //! The CUDA/HIP RT non-blocking queue device get trait specialization. 
+ template<> + struct GetDev< + queue::uniform_cuda_hip::detail::QueueUniformCudaHipRtBase> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto getDev( + queue::uniform_cuda_hip::detail::QueueUniformCudaHipRtBase const & queue) + -> dev::DevUniformCudaHipRt + { + return queue.m_spQueueImpl->m_dev; + } + }; + } + } + namespace queue + { + namespace traits + { + //############################################################################# + //! The CUDA/HIP RT blocking queue test trait specialization. + template<> + struct Empty< + queue::uniform_cuda_hip::detail::QueueUniformCudaHipRtBase> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto empty( + queue::uniform_cuda_hip::detail::QueueUniformCudaHipRtBase const & queue) + -> bool + { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + // Query is allowed even for queues on non current device. + ALPAKA_API_PREFIX(Error_t) ret = ALPAKA_API_PREFIX(Success); + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE( + ret = ALPAKA_API_PREFIX(StreamQuery)(queue.m_spQueueImpl->m_UniformCudaHipQueue), + ALPAKA_API_PREFIX(ErrorNotReady)); + return (ret == ALPAKA_API_PREFIX(Success)); + } + }; + } + } + namespace wait + { + namespace traits + { + //############################################################################# + //! The CUDA/HIP RT blocking queue thread wait trait specialization. + //! + //! Blocks execution of the calling thread until the queue has finished processing all previously requested tasks (kernels, data copies, ...) 
+ template<> + struct CurrentThreadWaitFor< + queue::uniform_cuda_hip::detail::QueueUniformCudaHipRtBase> + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST static auto currentThreadWaitFor( + queue::uniform_cuda_hip::detail::QueueUniformCudaHipRtBase const & queue) + -> void + { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + // Sync is allowed even for queues on non current device. + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( ALPAKA_API_PREFIX(StreamSynchronize)( + queue.m_spQueueImpl->m_UniformCudaHipQueue)); + } + }; + } + } +} + +#endif diff --git a/thirdParty/cupla/alpaka/include/alpaka/rand/RandStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/rand/RandStdLib.hpp index 23f0b12282..67123299e5 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/rand/RandStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/rand/RandStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/rand/RandUniformCudaHipRand.hpp b/thirdParty/cupla/alpaka/include/alpaka/rand/RandUniformCudaHipRand.hpp index e4448a9852..e13d3d7f8c 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/rand/RandUniformCudaHipRand.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/rand/RandUniformCudaHipRand.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -77,6 +77,10 @@ namespace alpaka { } +#if BOOST_COMP_HIP + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST_ACC ~Xor() = default; +#endif //----------------------------------------------------------------------------- __device__ Xor( diff --git a/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/Engine.hpp b/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/Engine.hpp index fb2582992d..14ff196306 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/Engine.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/Engine.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/LICENSE.txt b/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/LICENSE.txt index 7496ebe318..88bd89660f 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/LICENSE.txt +++ b/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/LICENSE.txt @@ -1,6 +1,6 @@ /* Copyright 2019 Mutsuo Saito * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/tinymt32.h b/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/tinymt32.h index 52ada12142..182c250d7c 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/tinymt32.h +++ b/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/tinymt32.h @@ -1,6 +1,6 @@ -/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Mutsuo Saito +/* Copyright 2019-2020 Axel Huebl, Benjamin Worpitz, Mutsuo Saito * - * This file is part of Alpaka. 
+ * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -41,13 +41,12 @@ #if BOOST_COMP_CLANG # pragma clang diagnostic push # pragma clang diagnostic ignored "-Wold-style-cast" -# pragma clang diagnostic ignored "-Wsign-conversion" #endif #if BOOST_COMP_GNUC # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wsign-conversion" +# pragma GCC diagnostic ignored "-Wold-style-cast" #endif -#if BOOST_COMP_MSVC +#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED) #pragma warning(push) #pragma warning(disable: 4100) // tinymt32.h(60): warning C4100: 'random': unreferenced formal parameter #endif @@ -114,8 +113,10 @@ inline static void tinymt32_next_state(tinymt32_t * random) { random->status[1] = random->status[2]; random->status[2] = x ^ (y << TINYMT32_SH1); random->status[3] = y; - random->status[1] ^= -((int32_t)(y & 1)) & random->mat1; - random->status[2] ^= -((int32_t)(y & 1)) & random->mat2; + int32_t const a = -((int32_t)(y & 1)) & (int32_t)random->mat1; + int32_t const b = -((int32_t)(y & 1)) & (int32_t)random->mat2; + random->status[1] ^= (uint32_t)a; + random->status[2] ^= (uint32_t)b; } /** @@ -135,7 +136,9 @@ inline static uint32_t tinymt32_temper(tinymt32_t * random) { + (random->status[2] >> TINYMT32_SH8); #endif t0 ^= t1; - t0 ^= -((int32_t)(t1 & 1)) & random->tmat; + if ((t1 & 1) != 0) { + t0 ^= random->tmat; + } return t0; } @@ -161,8 +164,11 @@ inline static float tinymt32_temper_conv(tinymt32_t * random) { + (random->status[2] >> TINYMT32_SH8); #endif t0 ^= t1; - conv.u = ((t0 ^ (-((int32_t)(t1 & 1)) & random->tmat)) >> 9) - | UINT32_C(0x3f800000); + if ((t1 & 1) != 0) { + conv.u = ((t0 ^ random->tmat) >> 9) | UINT32_C(0x3f800000); + } else { + conv.u = (t0 >> 9) | UINT32_C(0x3f800000); + } return conv.f; } @@ -188,8 +194,11 @@ inline static float tinymt32_temper_conv_open(tinymt32_t * random) { + (random->status[2] 
>> TINYMT32_SH8); #endif t0 ^= t1; - conv.u = ((t0 ^ (-((int32_t)(t1 & 1)) & random->tmat)) >> 9) - | UINT32_C(0x3f800001); + if ((t1 & 1) != 0) { + conv.u = ((t0 ^ random->tmat) >> 9) | UINT32_C(0x3f800001); + } else { + conv.u = (t0 >> 9) | UINT32_C(0x3f800001); + } return conv.f; } @@ -213,7 +222,7 @@ inline static uint32_t tinymt32_generate_uint32(tinymt32_t * random) { */ inline static float tinymt32_generate_float(tinymt32_t * random) { tinymt32_next_state(random); - return (tinymt32_temper(random) >> 8) * TINYMT32_MUL; + return (float)(tinymt32_temper(random) >> 8) * TINYMT32_MUL; } /** @@ -306,13 +315,13 @@ static uint32_t ini_func2(uint32_t x) { */ static void period_certification(tinymt32_t * random) { if ((random->status[0] & TINYMT32_MASK) == 0 && - random->status[1] == 0 && - random->status[2] == 0 && - random->status[3] == 0) { - random->status[0] = 'T'; - random->status[1] = 'I'; - random->status[2] = 'N'; - random->status[3] = 'Y'; + random->status[1] == 0 && + random->status[2] == 0 && + random->status[3] == 0) { + random->status[0] = 'T'; + random->status[1] = 'I'; + random->status[2] = 'N'; + random->status[3] = 'Y'; } } @@ -322,19 +331,19 @@ static void period_certification(tinymt32_t * random) { * @param random tinymt state vector. * @param seed a 32-bit unsigned integer used as a seed. 
*/ -inline void tinymt32_init(tinymt32_t * random, uint32_t seed) { +void tinymt32_init(tinymt32_t * random, uint32_t seed) { random->status[0] = seed; random->status[1] = random->mat1; random->status[2] = random->mat2; random->status[3] = random->tmat; - for (uint32_t i = 1; i < MIN_LOOP; i++) { - random->status[i & 3] ^= i + UINT32_C(1812433253) - * (random->status[(i - 1) & 3] - ^ (random->status[(i - 1) & 3] >> 30)); + for (unsigned int i = 1; i < MIN_LOOP; i++) { + random->status[i & 3] ^= i + UINT32_C(1812433253) + * (random->status[(i - 1) & 3] + ^ (random->status[(i - 1) & 3] >> 30)); } period_certification(random); - for (int i = 0; i < PRE_LOOP; i++) { - tinymt32_next_state(random); + for (unsigned int i = 0; i < PRE_LOOP; i++) { + tinymt32_next_state(random); } } @@ -345,14 +354,13 @@ inline void tinymt32_init(tinymt32_t * random, uint32_t seed) { * @param init_key the array of 32-bit integers, used as a seed. * @param key_length the length of init_key. */ -inline void tinymt32_init_by_array(tinymt32_t * random, uint32_t init_key[], - int key_length) { - const int lag = 1; - const int mid = 1; - const int size = 4; - uint32_t i; - int j; - int count; +void tinymt32_init_by_array(tinymt32_t * random, uint32_t init_key[], + int key_length) { + const unsigned int lag = 1; + const unsigned int mid = 1; + const unsigned int size = 4; + unsigned int i, j; + unsigned int count; uint32_t r; uint32_t * st = &random->status[0]; @@ -361,50 +369,50 @@ inline void tinymt32_init_by_array(tinymt32_t * random, uint32_t init_key[], st[2] = random->mat2; st[3] = random->tmat; if (key_length + 1 > MIN_LOOP) { - count = key_length + 1; + count = (unsigned int)key_length + 1; } else { - count = MIN_LOOP; + count = MIN_LOOP; } r = ini_func1(st[0] ^ st[mid % size] - ^ st[(size - 1) % size]); + ^ st[(size - 1) % size]); st[mid % size] += r; - r += uint32_t(key_length); + r += (unsigned int)key_length; st[(mid + lag) % size] += r; st[0] = r; count--; - for (i = 1, j = 0; (j < 
count) && (j < key_length); j++) { - r = ini_func1(st[i % size] - ^ st[(i + mid) % size] - ^ st[(i + size - 1) % size]); - st[(i + mid) % size] += r; - r += init_key[j] + i; - st[(i + mid + lag) % size] += r; - st[i % size] = r; - i = (i + 1) % size; + for (i = 1, j = 0; (j < count) && (j < (unsigned int)key_length); j++) { + r = ini_func1(st[i % size] + ^ st[(i + mid) % size] + ^ st[(i + size - 1) % size]); + st[(i + mid) % size] += r; + r += init_key[j] + i; + st[(i + mid + lag) % size] += r; + st[i % size] = r; + i = (i + 1) % size; } for (; j < count; j++) { - r = ini_func1(st[i % size] - ^ st[(i + mid) % size] - ^ st[(i + size - 1) % size]); - st[(i + mid) % size] += r; - r += i; - st[(i + mid + lag) % size] += r; - st[i % size] = r; - i = (i + 1) % size; + r = ini_func1(st[i % size] + ^ st[(i + mid) % size] + ^ st[(i + size - 1) % size]); + st[(i + mid) % size] += r; + r += i; + st[(i + mid + lag) % size] += r; + st[i % size] = r; + i = (i + 1) % size; } for (j = 0; j < size; j++) { - r = ini_func2(st[i % size] - + st[(i + mid) % size] - + st[(i + size - 1) % size]); - st[(i + mid) % size] ^= r; - r -= i; - st[(i + mid + lag) % size] ^= r; - st[i % size] = r; - i = (i + 1) % size; + r = ini_func2(st[i % size] + + st[(i + mid) % size] + + st[(i + size - 1) % size]); + st[(i + mid) % size] ^= r; + r -= i; + st[(i + mid + lag) % size] ^= r; + st[i % size] = r; + i = (i + 1) % size; } period_certification(random); for (i = 0; i < PRE_LOOP; i++) { - tinymt32_next_state(random); + tinymt32_next_state(random); } } @@ -417,7 +425,7 @@ inline void tinymt32_init_by_array(tinymt32_t * random, uint32_t init_key[], #if BOOST_COMP_GNUC # pragma GCC diagnostic pop #endif -#if BOOST_COMP_MSVC +#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED) # pragma warning(pop) #endif diff --git a/thirdParty/cupla/alpaka/include/alpaka/rand/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/rand/Traits.hpp index f1d528be8b..5a3db167be 100644 --- 
a/thirdParty/cupla/alpaka/include/alpaka/rand/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/rand/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -12,8 +12,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuFibers.hpp b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuFibers.hpp index 2a8180f98e..26d7f7653d 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuFibers.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuFibers.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp2Blocks.hpp b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp2Blocks.hpp index 1cecf21bf9..c8529a9c1b 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp2Blocks.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp2Blocks.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp2Threads.hpp b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp2Threads.hpp index c3cf763e37..9fe7c502ca 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp2Threads.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp2Threads.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp4.hpp b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp4.hpp index f93665dc57..27fd0da8dd 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp4.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp4.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuSerial.hpp b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuSerial.hpp index 7a4ab7013e..191fe3c7c9 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuSerial.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuSerial.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuTbbBlocks.hpp b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuTbbBlocks.hpp index 4c0f7ae0db..d123a6bd7c 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuTbbBlocks.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuTbbBlocks.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuThreads.hpp b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuThreads.hpp index 791ef4b4cf..7150f5a3db 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuThreads.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuThreads.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/standalone/GpuCudaRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/standalone/GpuCudaRt.hpp index 2648e0c222..56c67f1802 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/standalone/GpuCudaRt.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/standalone/GpuCudaRt.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/standalone/GpuHipRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/standalone/GpuHipRt.hpp index f322b14c89..a69378c990 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/standalone/GpuHipRt.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/standalone/GpuHipRt.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/time/TimeOmp.hpp b/thirdParty/cupla/alpaka/include/alpaka/time/TimeOmp.hpp index a4bad6edfe..e0eecfdcf8 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/time/TimeOmp.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/time/TimeOmp.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/time/TimeStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/time/TimeStdLib.hpp index 4cacf5584d..7aabc003e3 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/time/TimeStdLib.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/time/TimeStdLib.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/time/TimeUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/time/TimeUniformCudaHipBuiltIn.hpp index e1ecf0b1ce..4ee0088084 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/time/TimeUniformCudaHipBuiltIn.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/time/TimeUniformCudaHipBuiltIn.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/time/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/time/Traits.hpp index 4a5604b33f..77415f8787 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/time/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/time/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/vec/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/vec/Traits.hpp index 528b42835a..ef48b96b38 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/vec/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/vec/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -16,8 +16,6 @@ #include #include -#include - #include namespace alpaka diff --git a/thirdParty/cupla/alpaka/include/alpaka/vec/Vec.hpp b/thirdParty/cupla/alpaka/include/alpaka/vec/Vec.hpp index 48c72463fc..088909cadc 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/vec/Vec.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/vec/Vec.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -25,21 +25,12 @@ #include #include -#include - #include #include #include #include #include -// Some compilers do not support the out of class versions: -// - the nvcc CUDA compiler (at least 8.0) -// - the intel compiler -#if BOOST_COMP_HIP || BOOST_COMP_NVCC || BOOST_COMP_INTEL || (BOOST_COMP_CLANG_CUDA >= BOOST_VERSION_NUMBER(4, 0, 0)) || (BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(8, 0, 0)) - #define ALPAKA_CREATE_VEC_IN_CLASS -#endif - namespace alpaka { namespace vec @@ -49,7 +40,6 @@ namespace alpaka typename TVal> class Vec; -#ifndef ALPAKA_CREATE_VEC_IN_CLASS //----------------------------------------------------------------------------- //! Single value constructor helper. ALPAKA_NO_HOST_ACC_WARNING @@ -108,7 +98,6 @@ namespace alpaka IdxSubSequence(), std::forward(args)...); } -#endif //############################################################################# //! A n-dimensional vector. 
@@ -121,7 +110,6 @@ namespace alpaka static_assert(TDim::value >= 0u, "Invalid dimensionality"); using Dim = TDim; - static constexpr auto s_uiDim = TDim::value; using Val = TVal; private: @@ -160,64 +148,6 @@ namespace alpaka m_data{std::forward(arg0), std::forward(args)...} {} -#ifdef ALPAKA_CREATE_VEC_IN_CLASS - //----------------------------------------------------------------------------- - //! Creator using func(args...) to initialize all values of the vector. - ALPAKA_NO_HOST_ACC_WARNING - template< - template class TTFnObj, - typename... TArgs, - typename TIdxSize, - TIdxSize... TIndices> - ALPAKA_FN_HOST_ACC static auto createVecFromIndexedFnArbitrary( - std::integer_sequence const & indices, - TArgs && ... args) - -> Vec - { - alpaka::ignore_unused(indices); - - return Vec( - (TTFnObj::create(std::forward(args)...))...); - } - //----------------------------------------------------------------------------- - //! Creator using func(args...) to initialize all values of the vector. - //! The idx is in the range [0, TDim]. - ALPAKA_NO_HOST_ACC_WARNING - template< - template class TTFnObj, - typename... TArgs> - ALPAKA_FN_HOST_ACC static auto createVecFromIndexedFn( - TArgs && ... args) - -> Vec - { - return - createVecFromIndexedFnArbitrary< - TTFnObj>( - IdxSequence(), - std::forward(args)...); - } - //----------------------------------------------------------------------------- - //! Creator using func(args...) to initialize all values of the vector. - //! The idx is in the range [TIdxOffset, TIdxOffset + TDim]. - ALPAKA_NO_HOST_ACC_WARNING - template< - template class TTFnObj, - typename TIdxOffset, - typename... TArgs> - ALPAKA_FN_HOST_ACC static auto createVecFromIndexedFnOffset( - TArgs && ... 
args) - -> Vec - { - using IdxSubSequenceSigned = meta::MakeIntegerSequenceOffset; - using IdxSubSequence = meta::ConvertIntegerSequence; - return - createVecFromIndexedFnArbitrary< - TTFnObj>( - IdxSubSequence(), - std::forward(args)...); - } -#endif - //----------------------------------------------------------------------------- ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC @@ -225,7 +155,7 @@ namespace alpaka //----------------------------------------------------------------------------- ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC - Vec(Vec &&) = default; + Vec(Vec &&) noexcept = default; //----------------------------------------------------------------------------- ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC @@ -233,7 +163,7 @@ namespace alpaka //----------------------------------------------------------------------------- ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC - auto operator=(Vec &&) -> Vec & = default; + auto operator=(Vec &&) noexcept -> Vec & = default; //----------------------------------------------------------------------------- ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC ~Vec() = default; @@ -267,9 +197,7 @@ namespace alpaka { return createVecFromIndexedFn< -#ifndef ALPAKA_CREATE_VEC_IN_CLASS TDim, -#endif CreateSingleVal>( val); } @@ -378,7 +306,7 @@ namespace alpaka IdxSequence()); } // suppress strange warning produced by nvcc+MSVC in release mode -#if BOOST_COMP_MSVC +#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED) #pragma warning(push) #pragma warning(disable: 4702) // unreachable code #endif @@ -390,7 +318,7 @@ namespace alpaka { return foldrAll(std::multiplies()); } -#if BOOST_COMP_MSVC +#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED) #pragma warning(pop) #endif //----------------------------------------------------------------------------- @@ -449,67 +377,14 @@ namespace alpaka TVal m_data[TDim::value == 0u ? 
1u : TDim::value]; }; - //----------------------------------------------------------------------------- - //! This is a conveniance method to have a out-of-class factory method even though the out-of-class version is not supported by all compilers. - //! Depending of the compiler conformance, the internal or external factory function is called. - //! This has the draw-back, that it requires the TVal parameter even though it should not be necessary. - ALPAKA_NO_HOST_ACC_WARNING - template< - typename TDim, - typename TVal, - template class TTFnObj, - typename... TArgs> - ALPAKA_FN_HOST_ACC auto createVecFromIndexedFnWorkaround( - TArgs && ... args) - { - return - alpaka::vec:: -#ifdef ALPAKA_CREATE_VEC_IN_CLASS - Vec::template -#endif - createVecFromIndexedFn< -#ifndef ALPAKA_CREATE_VEC_IN_CLASS - TDim, -#endif - TTFnObj>( - std::forward(args)...); - } - - //----------------------------------------------------------------------------- - //! This is a conveniance method to have a out-of-class factory method even though the out-of-class version is not supported by all compilers. - //! Depending of the compiler conformance, the internal or external factory function is called. - //! This has the draw-back, that it requires the TVal parameter even though it should not be necessary. - ALPAKA_NO_HOST_ACC_WARNING - template< - typename TDim, - typename TVal, - template class TTFnObj, - typename TIdxOffset, - typename... TArgs> - ALPAKA_FN_HOST_ACC auto createVecFromIndexedFnOffsetWorkaround( - TArgs && ... args) - { - return - alpaka::vec:: -#ifdef ALPAKA_CREATE_VEC_IN_CLASS - Vec::template -#endif - createVecFromIndexedFnOffset< -#ifndef ALPAKA_CREATE_VEC_IN_CLASS - TDim, -#endif - TTFnObj, - TIdxOffset>( - std::forward(args)...); - } - namespace detail { //############################################################################# - //! A function object that returns the sum of the two input vectors elements. + //! 
This is used to create a Vec by applying a binary operation onto the corresponding elements of two input vectors. template< + template class TFnObj, std::size_t Tidx> - struct CreateAdd + struct CreateVecByApplyingBinaryFnToTwoIndexedVecs { //----------------------------------------------------------------------------- ALPAKA_NO_HOST_ACC_WARNING @@ -519,12 +394,19 @@ namespace alpaka ALPAKA_FN_HOST_ACC static auto create( Vec const & p, Vec const & q) - -> TVal { - return p[Tidx] + q[Tidx]; + return TFnObj()(p[Tidx], q[Tidx]); } }; } + + namespace detail + { + template< + std::size_t Tidx> + using CreateVecFromTwoIndexedVecsPlus = CreateVecByApplyingBinaryFnToTwoIndexedVecs; + } + //----------------------------------------------------------------------------- //! \return The element-wise sum of two vectors. ALPAKA_NO_HOST_ACC_WARNING @@ -537,35 +419,18 @@ namespace alpaka -> Vec { return - createVecFromIndexedFnWorkaround< + createVecFromIndexedFn< TDim, - TVal, - detail::CreateAdd>( + detail::CreateVecFromTwoIndexedVecsPlus>( p, q); } namespace detail { - //################################################################################## - //! A function object that returns the difference of the two input vectors elements. 
template< std::size_t Tidx> - struct CreateSub - { - //----------------------------------------------------------------------------- - ALPAKA_NO_HOST_ACC_WARNING - template< - typename TDim, - typename TVal> - ALPAKA_FN_HOST_ACC static auto create( - Vec const & p, - Vec const & q) - -> TVal - { - return p[Tidx] - q[Tidx]; - } - }; + using CreateVecFromTwoIndexedVecsMinus = CreateVecByApplyingBinaryFnToTwoIndexedVecs; } //----------------------------------------------------------------------------- @@ -580,35 +445,18 @@ namespace alpaka -> Vec { return - createVecFromIndexedFnWorkaround< + createVecFromIndexedFn< TDim, - TVal, - detail::CreateSub>( + detail::CreateVecFromTwoIndexedVecsMinus>( p, q); } namespace detail { - //############################################################################# - //! A function object that returns the product of the two input vectors elements. template< std::size_t Tidx> - struct CreateMul - { - //----------------------------------------------------------------------------- - ALPAKA_NO_HOST_ACC_WARNING - template< - typename TDim, - typename TVal> - ALPAKA_FN_HOST_ACC static auto create( - Vec const & p, - Vec const & q) - -> TVal - { - return p[Tidx] * q[Tidx]; - } - }; + using CreateVecFromTwoIndexedVecsMul = CreateVecByApplyingBinaryFnToTwoIndexedVecs; } //----------------------------------------------------------------------------- @@ -623,35 +471,18 @@ namespace alpaka -> Vec { return - createVecFromIndexedFnWorkaround< + createVecFromIndexedFn< TDim, - TVal, - detail::CreateMul>( + detail::CreateVecFromTwoIndexedVecsMul>( p, q); } namespace detail { - //############################################################################# - //! A function object that returns the element-wise less than relation of two vectors. 
template< std::size_t Tidx> - struct CreateLess - { - //----------------------------------------------------------------------------- - ALPAKA_NO_HOST_ACC_WARNING - template< - typename TDim, - typename TVal> - ALPAKA_FN_HOST_ACC static auto create( - Vec const & p, - Vec const & q) - -> bool - { - return p[Tidx] < q[Tidx]; - } - }; + using CreateVecFromTwoIndexedVecsLess = CreateVecByApplyingBinaryFnToTwoIndexedVecs; } //----------------------------------------------------------------------------- @@ -666,35 +497,18 @@ namespace alpaka -> Vec { return - createVecFromIndexedFnWorkaround< + createVecFromIndexedFn< TDim, - bool, - detail::CreateLess>( + detail::CreateVecFromTwoIndexedVecsLess>( p, q); } namespace detail { - //############################################################################# - //! A function object that returns the element-wise less than or equal relation of two vectors. template< std::size_t Tidx> - struct CreateLessEqual - { - //----------------------------------------------------------------------------- - ALPAKA_NO_HOST_ACC_WARNING - template< - typename TDim, - typename TVal> - ALPAKA_FN_HOST_ACC static auto create( - Vec const & p, - Vec const & q) - -> bool - { - return p[Tidx] <= q[Tidx]; - } - }; + using CreateVecFromTwoIndexedVecsLessEqual = CreateVecByApplyingBinaryFnToTwoIndexedVecs; } //----------------------------------------------------------------------------- @@ -709,35 +523,18 @@ namespace alpaka -> Vec { return - createVecFromIndexedFnWorkaround< + createVecFromIndexedFn< TDim, - bool, - detail::CreateLessEqual>( + detail::CreateVecFromTwoIndexedVecsLessEqual>( p, q); } namespace detail { - //############################################################################# - //! A function object that returns the element-wise greater than or equal relation of two vectors. 
template< std::size_t Tidx> - struct CreateGreaterEqual - { - //----------------------------------------------------------------------------- - ALPAKA_NO_HOST_ACC_WARNING - template< - typename TDim, - typename TVal> - ALPAKA_FN_HOST_ACC static auto create( - Vec const & p, - Vec const & q) - -> bool - { - return p[Tidx] >= q[Tidx]; - } - }; + using CreateVecFromTwoIndexedVecsGreaterEqual = CreateVecByApplyingBinaryFnToTwoIndexedVecs; } //----------------------------------------------------------------------------- @@ -752,35 +549,18 @@ namespace alpaka -> Vec { return - createVecFromIndexedFnWorkaround< + createVecFromIndexedFn< TDim, - bool, - detail::CreateGreaterEqual>( + detail::CreateVecFromTwoIndexedVecsGreaterEqual>( p, q); } namespace detail { - //############################################################################# - //! A function object that returns the element-wise greater than relation of two vectors. template< std::size_t Tidx> - struct CreateGreater - { - //----------------------------------------------------------------------------- - ALPAKA_NO_HOST_ACC_WARNING - template< - typename TDim, - typename TVal> - ALPAKA_FN_HOST_ACC static auto create( - Vec const & p, - Vec const & q) - -> bool - { - return p[Tidx] > q[Tidx]; - } - }; + using CreateVecFromTwoIndexedVecsGreater = CreateVecByApplyingBinaryFnToTwoIndexedVecs; } //----------------------------------------------------------------------------- @@ -795,10 +575,9 @@ namespace alpaka -> Vec { return - createVecFromIndexedFnWorkaround< + createVecFromIndexedFn< TDim, - bool, - detail::CreateGreater>( + detail::CreateVecFromTwoIndexedVecsGreater>( p, q); } @@ -963,9 +742,8 @@ namespace alpaka -> Vec { return - createVecFromIndexedFnWorkaround< + createVecFromIndexedFn< TDim, - TSizeNew, vec::detail::CreateCast>( TSizeNew(), vec); @@ -1030,9 +808,8 @@ namespace alpaka -> Vec { return - createVecFromIndexedFnWorkaround< + createVecFromIndexedFn< TDim, - TVal, vec::detail::CreateReverse>( 
vec); } @@ -1097,9 +874,8 @@ namespace alpaka -> Vec, TVal> { return - createVecFromIndexedFnWorkaround< + createVecFromIndexedFn< dim::DimInt, - TVal, vec::detail::CreateConcat>( vecL, vecR); @@ -1141,9 +917,8 @@ namespace alpaka -> vec::Vec, idx::Idx> { return - vec::createVecFromIndexedFnWorkaround< + vec::createVecFromIndexedFn< dim::Dim, - idx::Idx, detail::CreateExtent>( extent); } @@ -1160,9 +935,8 @@ namespace alpaka { using IdxOffset = std::integral_constant(dim::Dim::value) - static_cast(TDim::value)>; return - vec::createVecFromIndexedFnOffsetWorkaround< + vec::createVecFromIndexedFnOffset< TDim, - idx::Idx, detail::CreateExtent, IdxOffset>( extent); @@ -1202,9 +976,8 @@ namespace alpaka -> vec::Vec, idx::Idx> { return - vec::createVecFromIndexedFnWorkaround< + vec::createVecFromIndexedFn< dim::Dim, - idx::Idx, detail::CreateOffset>( offsets); } @@ -1221,9 +994,8 @@ namespace alpaka { using IdxOffset = std::integral_constant(static_cast(dim::Dim::value) - static_cast(TDim::value))>; return - vec::createVecFromIndexedFnOffsetWorkaround< + vec::createVecFromIndexedFnOffset< TDim, - idx::Idx, detail::CreateOffset, IdxOffset>( offsets); diff --git a/thirdParty/cupla/alpaka/include/alpaka/version.hpp b/thirdParty/cupla/alpaka/include/alpaka/version.hpp index afa0a317da..cb2db90227 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/version.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/version.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Erik Zenker * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/wait/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/wait/Traits.hpp index 89b1290b4e..5426a1a7eb 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/wait/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/wait/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/include/alpaka/warp/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/warp/Traits.hpp new file mode 100644 index 0000000000..f84206436f --- /dev/null +++ b/thirdParty/cupla/alpaka/include/alpaka/warp/Traits.hpp @@ -0,0 +1,200 @@ +/* Copyright 2020 Sergei Bastrakov + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace alpaka +{ + //----------------------------------------------------------------------------- + //! The thread warp specifics + namespace warp + { + struct ConceptWarp{}; + + //----------------------------------------------------------------------------- + //! The warp traits. + namespace traits + { + //############################################################################# + //! The warp size trait. + template< + typename TWarp, + typename TSfinae = void> + struct GetSize; + + //############################################################################# + //! The all warp vote trait. + template< + typename TWarp, + typename TSfinae = void> + struct All; + + //############################################################################# + //! 
The any warp vote trait. + template< + typename TWarp, + typename TSfinae = void> + struct Any; + + //############################################################################# + //! The ballot warp vote trait. + template< + typename TWarp, + typename TSfinae = void> + struct Ballot; + + //############################################################################# + //! The active mask trait. + template< + typename TWarp, + typename TSfinae = void> + struct Activemask; + } + + //----------------------------------------------------------------------------- + //! Returns warp size. + //! + //! \tparam TWarp The warp implementation type. + //! \param warp The warp implementation. + ALPAKA_NO_HOST_ACC_WARNING + template< + typename TWarp> + ALPAKA_FN_ACC auto getSize( + TWarp const & warp) + -> std::int32_t + { + using ImplementationBase = concepts::ImplementationBase< + ConceptWarp, + TWarp>; + return traits::GetSize< + ImplementationBase> + ::getSize( + warp); + } + + //----------------------------------------------------------------------------- + //! Returns a 32- or 64-bit unsigned integer (depending on the + //! accelerator) whose Nth bit is set if and only if the Nth thread + //! of the warp is active. + //! + //! Note: decltype for return type is required there, otherwise + //! compilation with a CPU and a GPU accelerator enabled fails as it + //! tries to call device function from a host-device one. The reason + //! is unclear, but likely related to deducing the return type. + //! + //! \tparam TWarp The warp implementation type. + //! \param warp The warp implementation. + //! \return 32-bit or 64-bit unsigned type depending on the accelerator.
+ ALPAKA_NO_HOST_ACC_WARNING + template< + typename TWarp> + ALPAKA_FN_ACC auto activemask( + TWarp const & warp) -> decltype(traits::Activemask< + concepts::ImplementationBase >::activemask(warp)) + { + using ImplementationBase = concepts::ImplementationBase< + ConceptWarp, + TWarp>; + return traits::Activemask< + ImplementationBase> + ::activemask( + warp); + } + + //----------------------------------------------------------------------------- + //! Evaluates predicate for all active threads of the warp and returns + //! non-zero if and only if predicate evaluates to non-zero for all of them. + //! + //! It follows the logic of __all(predicate) in CUDA before version 9.0 and HIP, + //! the operation is applied for all active threads. + //! The modern CUDA counterpart would be __all_sync(__activemask(), predicate). + //! + //! \tparam TWarp The warp implementation type. + //! \param warp The warp implementation. + //! \param predicate The predicate value for current thread. + ALPAKA_NO_HOST_ACC_WARNING + template< + typename TWarp> + ALPAKA_FN_ACC auto all( + TWarp const & warp, + std::int32_t predicate) + -> std::int32_t + { + using ImplementationBase = concepts::ImplementationBase; + return traits::All< + ImplementationBase> + ::all( + warp, + predicate); + } + + //----------------------------------------------------------------------------- + //! Evaluates predicate for all active threads of the warp and returns + //! non-zero if and only if predicate evaluates to non-zero for any of them. + //! + //! It follows the logic of __any(predicate) in CUDA before version 9.0 and HIP, + //! the operation is applied for all active threads. + //! The modern CUDA counterpart would be __any_sync(__activemask(), predicate). + //! + //! \tparam TWarp The warp implementation type. + //! \param warp The warp implementation. + //! \param predicate The predicate value for current thread. 
+ ALPAKA_NO_HOST_ACC_WARNING + template< + typename TWarp> + ALPAKA_FN_ACC auto any( + TWarp const & warp, + std::int32_t predicate) + -> std::int32_t + { + using ImplementationBase = concepts::ImplementationBase; + return traits::Any< + ImplementationBase> + ::any( + warp, + predicate); + } + + //----------------------------------------------------------------------------- + //! Evaluates predicate for all non-exited threads in a warp and returns + //! a 32- or 64-bit unsigned integer (depending on the accelerator) + //! whose Nth bit is set if and only if predicate evaluates to non-zero + //! for the Nth thread of the warp and the Nth thread is active. + //! + //! It follows the logic of __ballot(predicate) in CUDA before version 9.0 and HIP, + //! the operation is applied for all active threads. + //! The modern CUDA counterpart would be __ballot_sync(__activemask(), predicate). + //! Return type is 64-bit to fit all platforms. + //! + //! \tparam TWarp The warp implementation type. + //! \param warp The warp implementation. + //! \param predicate The predicate value for current thread. + //! \return 32-bit or 64-bit unsigned type depending on the accelerator. + ALPAKA_NO_HOST_ACC_WARNING + template< + typename TWarp> + ALPAKA_FN_ACC auto ballot( + TWarp const & warp, + std::int32_t predicate) + { + using ImplementationBase = concepts::ImplementationBase; + return traits::Ballot< + ImplementationBase> + ::ballot( + warp, + predicate); + } + } +} diff --git a/thirdParty/cupla/alpaka/include/alpaka/warp/WarpSingleThread.hpp b/thirdParty/cupla/alpaka/include/alpaka/warp/WarpSingleThread.hpp new file mode 100644 index 0000000000..c1105b56ec --- /dev/null +++ b/thirdParty/cupla/alpaka/include/alpaka/warp/WarpSingleThread.hpp @@ -0,0 +1,110 @@ +/* Copyright 2020 Sergei Bastrakov + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include + +#include + +namespace alpaka +{ + namespace warp + { + //############################################################################# + //! The single-threaded warp to emulate it on CPUs. + class WarpSingleThread : public concepts::Implements + { + public: + //----------------------------------------------------------------------------- + WarpSingleThread() = default; + //----------------------------------------------------------------------------- + WarpSingleThread(WarpSingleThread const &) = delete; + //----------------------------------------------------------------------------- + WarpSingleThread(WarpSingleThread &&) = delete; + //----------------------------------------------------------------------------- + auto operator=(WarpSingleThread const &) -> WarpSingleThread & = delete; + //----------------------------------------------------------------------------- + auto operator=(WarpSingleThread &&) -> WarpSingleThread & = delete; + //----------------------------------------------------------------------------- + ~WarpSingleThread() = default; + }; + + namespace traits + { + //############################################################################# + template<> + struct GetSize< + WarpSingleThread> + { + //----------------------------------------------------------------------------- + static auto getSize( + warp::WarpSingleThread const & /*warp*/) + { + return 1; + } + }; + + //############################################################################# + template<> + struct Activemask< + WarpSingleThread> + { + //----------------------------------------------------------------------------- + static auto activemask( + warp::WarpSingleThread const & /*warp*/) + { + return 1u; + } + }; + + //############################################################################# + template<> + struct All< + 
WarpSingleThread> + { + //----------------------------------------------------------------------------- + static auto all( + warp::WarpSingleThread const & /*warp*/, + std::int32_t predicate) + { + return predicate; + } + }; + + //############################################################################# + template<> + struct Any< + WarpSingleThread> + { + //----------------------------------------------------------------------------- + static auto any( + warp::WarpSingleThread const & /*warp*/, + std::int32_t predicate) + { + return predicate; + } + }; + + //############################################################################# + template<> + struct Ballot< + WarpSingleThread> + { + //----------------------------------------------------------------------------- + static auto ballot( + warp::WarpSingleThread const & /*warp*/, + std::int32_t predicate) + { + return predicate ? 1u : 0u; + } + }; + } + } +} diff --git a/thirdParty/cupla/alpaka/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp new file mode 100644 index 0000000000..e41c9daa8a --- /dev/null +++ b/thirdParty/cupla/alpaka/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp @@ -0,0 +1,174 @@ +/* Copyright 2020 Sergei Bastrakov + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED) + +#include + +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA + #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA! +#endif + +#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP + #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP! 
+#endif + +#include +#include + +#include + +namespace alpaka +{ + namespace warp + { + //############################################################################# + //! The GPU CUDA/HIP warp. + class WarpUniformCudaHipBuiltIn : public concepts::Implements + { + public: + //----------------------------------------------------------------------------- + WarpUniformCudaHipBuiltIn() = default; + //----------------------------------------------------------------------------- + __device__ WarpUniformCudaHipBuiltIn(WarpUniformCudaHipBuiltIn const &) = delete; + //----------------------------------------------------------------------------- + __device__ WarpUniformCudaHipBuiltIn(WarpUniformCudaHipBuiltIn &&) = delete; + //----------------------------------------------------------------------------- + __device__ auto operator=(WarpUniformCudaHipBuiltIn const &) -> WarpUniformCudaHipBuiltIn & = delete; + //----------------------------------------------------------------------------- + __device__ auto operator=(WarpUniformCudaHipBuiltIn &&) -> WarpUniformCudaHipBuiltIn & = delete; + //----------------------------------------------------------------------------- + ~WarpUniformCudaHipBuiltIn() = default; + }; + + namespace traits + { + //############################################################################# + template<> + struct GetSize< + WarpUniformCudaHipBuiltIn> + { + //----------------------------------------------------------------------------- + __device__ static auto getSize( + warp::WarpUniformCudaHipBuiltIn const & /*warp*/) + -> std::int32_t + { + return warpSize; + } + }; + + //############################################################################# + template<> + struct Activemask< + WarpUniformCudaHipBuiltIn> + { + //----------------------------------------------------------------------------- + __device__ static auto activemask( + warp::WarpUniformCudaHipBuiltIn const & /*warp*/) +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) + -> std::uint32_t +#else 
+ -> std::uint64_t +#endif + { +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) + // Workaround for clang + CUDA 9.2 which uses the wrong PTX ISA, + // discussion in https://github.com/alpaka-group/alpaka/pull/1003 + // Can't use __activemask(), so emulate with __ballot_sync() + #if BOOST_COMP_CLANG_CUDA && BOOST_LANG_CUDA == BOOST_VERSION_NUMBER(9, 2, 0) + return __ballot_sync( + 0xffffffff, + 1); + #else + return __activemask(); + #endif +#else + // No HIP intrinsic for it, emulate via ballot + return __ballot(1); +#endif + } + }; + + //############################################################################# + template<> + struct All< + WarpUniformCudaHipBuiltIn> + { + //----------------------------------------------------------------------------- + __device__ static auto all( + warp::WarpUniformCudaHipBuiltIn const & warp, + std::int32_t predicate) + -> std::int32_t + { +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) + return __all_sync( + activemask(warp), + predicate); +#else + ignore_unused(warp); + return __all(predicate); +#endif + } + }; + + //############################################################################# + template<> + struct Any< + WarpUniformCudaHipBuiltIn> + { + //----------------------------------------------------------------------------- + __device__ static auto any( + warp::WarpUniformCudaHipBuiltIn const & warp, + std::int32_t predicate) + -> std::int32_t + { +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) + return __any_sync( + activemask(warp), + predicate); +#else + ignore_unused(warp); + return __any(predicate); +#endif + } + }; + + //############################################################################# + template<> + struct Ballot< + WarpUniformCudaHipBuiltIn> + { + //----------------------------------------------------------------------------- + __device__ static auto ballot( + warp::WarpUniformCudaHipBuiltIn const & warp, + std::int32_t predicate) + // return type is required by the compiler +#if 
defined(ALPAKA_ACC_GPU_CUDA_ENABLED) + -> std::uint32_t +#else + -> std::uint64_t +#endif + { +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) + return __ballot_sync( + activemask(warp), + predicate); +#else + ignore_unused(warp); + return __ballot(predicate); +#endif + } + }; + } + } +} + +#endif diff --git a/thirdParty/cupla/alpaka/include/alpaka/workdiv/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/workdiv/Traits.hpp index 81aa34fb9d..b6da4f60fb 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/workdiv/Traits.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/workdiv/Traits.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -16,8 +16,6 @@ #include #include -#include - #include #include diff --git a/thirdParty/cupla/alpaka/include/alpaka/workdiv/WorkDivHelpers.hpp b/thirdParty/cupla/alpaka/include/alpaka/workdiv/WorkDivHelpers.hpp index c923a61b7d..dc6fc2bf77 100644 --- a/thirdParty/cupla/alpaka/include/alpaka/workdiv/WorkDivHelpers.hpp +++ b/thirdParty/cupla/alpaka/include/alpaka/workdiv/WorkDivHelpers.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -235,7 +235,7 @@ namespace alpaka // For equal block thread extent this is easily the nth root of blockThreadCountMax. 
if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::EqualExtent) { - double const fNthRoot(std::pow(blockThreadCountMax, 1.0/static_cast(TDim::value))); + double const fNthRoot(std::pow(static_cast(blockThreadCountMax), 1.0/static_cast(TDim::value))); TIdx const nthRoot(static_cast(fNthRoot)); for(typename TDim::value_type i(0u); i= 17 )) then - if [ "${CXX}" == "clang++" ] - then - if (( "${ALPAKA_CI_CLANG_LIBSTDCPP_VERSION}" < 7 )) - then - echo "Clang used in c++17 mode requires libstdc++-7 or newer." - exit 1 - fi - fi if [ "${ALPAKA_CI_INSTALL_FIBERS}" == "ON" ] then if (( ( ( "${ALPAKA_CI_BOOST_BRANCH_MAJOR}" == 1 ) && ( "${ALPAKA_CI_BOOST_BRANCH_MINOR}" < 67 ) ) || ( "${ALPAKA_CI_BOOST_BRANCH_MAJOR}" < 1 ) )) diff --git a/thirdParty/cupla/alpaka/script/ci.sh b/thirdParty/cupla/alpaka/script/ci.sh index 5dd020faae..a7c3dfe04e 100755 --- a/thirdParty/cupla/alpaka/script/ci.sh +++ b/thirdParty/cupla/alpaka/script/ci.sh @@ -3,7 +3,7 @@ # # Copyright 2018-2019 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -15,11 +15,10 @@ source ./script/set.sh ./script/print_env.sh source ./script/before_install.sh -if [ "$TRAVIS_OS_NAME" = "linux" ] +if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] then - ./script/docker_install.sh - ./script/docker_run.sh -elif [ "$TRAVIS_OS_NAME" = "windows" ] || [ "$TRAVIS_OS_NAME" = "osx" ] + ./script/docker_ci.sh +elif [ "$ALPAKA_CI_OS_NAME" = "Windows" ] || [ "$ALPAKA_CI_OS_NAME" = "macOS" ] then ./script/install.sh ./script/run.sh diff --git a/thirdParty/cupla/alpaka/script/docker_run.sh b/thirdParty/cupla/alpaka/script/docker_ci.sh similarity index 85% rename from thirdParty/cupla/alpaka/script/docker_run.sh rename to thirdParty/cupla/alpaka/script/docker_ci.sh index 9b89d83af9..20c1ee52a2 100755 --- a/thirdParty/cupla/alpaka/script/docker_run.sh +++ b/thirdParty/cupla/alpaka/script/docker_ci.sh @@ -3,7 +3,7 @@ # # Copyright 2017-2019 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this @@ -16,22 +16,17 @@ source ./script/set.sh ALPAKA_DOCKER_ENV_LIST=() ALPAKA_DOCKER_ENV_LIST+=("--env" "CC=${CC}") ALPAKA_DOCKER_ENV_LIST+=("--env" "CXX=${CXX}") -ALPAKA_DOCKER_ENV_LIST+=("--env" "TRAVIS_OS_NAME=${TRAVIS_OS_NAME}") +ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_OS_NAME=${ALPAKA_CI_OS_NAME}") ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}") ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_ANALYSIS=${ALPAKA_CI_ANALYSIS}") ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_BOOST_BRANCH=${ALPAKA_CI_BOOST_BRANCH}") ALPAKA_DOCKER_ENV_LIST+=("--env" "BOOST_ROOT=${BOOST_ROOT}") ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_BOOST_LIB_DIR=${ALPAKA_CI_BOOST_LIB_DIR}") -ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CLANG_DIR=${ALPAKA_CI_CLANG_DIR}") if [ ! 
-z "${ALPAKA_CI_CLANG_VER+x}" ] then ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CLANG_VER=${ALPAKA_CI_CLANG_VER}") fi ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_STDLIB=${ALPAKA_CI_STDLIB}") -if [ ! -z "${ALPAKA_CI_CLANG_LIBSTDCPP_VERSION+x}" ] -then - ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CLANG_LIBSTDCPP_VERSION=${ALPAKA_CI_CLANG_LIBSTDCPP_VERSION}") -fi ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CMAKE_VER=${ALPAKA_CI_CMAKE_VER}") ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CMAKE_DIR=${ALPAKA_CI_CMAKE_DIR}") if [ ! -z "${ALPAKA_CI_GCC_VER+x}" ] @@ -98,6 +93,8 @@ then ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_HIP_ROOT_DIR=${ALPAKA_CI_HIP_ROOT_DIR}") ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_HIP_PLATFORM=${ALPAKA_HIP_PLATFORM}") fi +ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_TBB=${ALPAKA_CI_INSTALL_TBB}") +ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_FIBERS=${ALPAKA_CI_INSTALL_FIBERS}") # runtime only options ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI=${ALPAKA_CI}") @@ -146,13 +143,4 @@ then ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION=${ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION}") fi -docker images -docker images -q ${ALPAKA_CI_DOCKER_IMAGE_NAME} - -# If we have created the image in the current run, we do not have to load it again, because it is already available. 
-if [[ "$(docker images -q ${ALPAKA_CI_DOCKER_IMAGE_NAME} 2> /dev/null)" == "" ]]; then - gzip -dc "${ALPAKA_CI_DOCKER_CACHE_IMAGE_FILE_PATH}" | docker load -fi - -# --cap-add SYS_PTRACE is required for LSAN to work -docker run --cap-add SYS_PTRACE -v "$(pwd)":"$(pwd)" -w "$(pwd)" "${ALPAKA_DOCKER_ENV_LIST[@]}" --rm "${ALPAKA_CI_DOCKER_IMAGE_NAME}" /bin/bash ./script/run.sh +docker run -v "$(pwd)":"$(pwd)" -w "$(pwd)" "${ALPAKA_DOCKER_ENV_LIST[@]}" "${ALPAKA_CI_DOCKER_BASE_IMAGE_NAME}" /bin/bash -c "./script/install.sh && ./script/run.sh" diff --git a/thirdParty/cupla/alpaka/script/docker_install.sh b/thirdParty/cupla/alpaka/script/docker_install.sh deleted file mode 100755 index 555d7bb3e3..0000000000 --- a/thirdParty/cupla/alpaka/script/docker_install.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/bash - -# -# Copyright 2017-2019 Benjamin Worpitz -# -# This file is part of Alpaka. -# -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at http://mozilla.org/MPL/2.0/. -# - -source ./script/set.sh - -ls "${ALPAKA_CI_DOCKER_CACHE_DIR}" - -ALPAKA_DOCKER_BUILD_REQUIRED=1 - -if [ -f "${ALPAKA_CI_DOCKER_CACHE_IMAGE_FILE_PATH}" ] -then - # NOTE: The image being available is not the only precondition. If anything within any of the scripts has changed in comparison to the ones that created the docker image, we might have to rebuild the image. 
- ALPAKA_DOCKER_BUILD_REQUIRED=0 -fi - -# runtime and compile time options -ALPAKA_DOCKER_ENV_LIST=() -ALPAKA_DOCKER_ENV_LIST+=("--env" "CC=${CC}") -ALPAKA_DOCKER_ENV_LIST+=("--env" "CXX=${CXX}") -ALPAKA_DOCKER_ENV_LIST+=("--env" "TRAVIS_OS_NAME=${TRAVIS_OS_NAME}") -ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}") -ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=${ALPAKA_CI_DOCKER_BASE_IMAGE_NAME}") -ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_ANALYSIS=${ALPAKA_CI_ANALYSIS}") -ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_BOOST_BRANCH=${ALPAKA_CI_BOOST_BRANCH}") -ALPAKA_DOCKER_ENV_LIST+=("--env" "BOOST_ROOT=${BOOST_ROOT}") -ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_BOOST_LIB_DIR=${ALPAKA_CI_BOOST_LIB_DIR}") -ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CLANG_DIR=${ALPAKA_CI_CLANG_DIR}") -if [ ! -z "${ALPAKA_CI_CLANG_VER+x}" ] -then - ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CLANG_VER=${ALPAKA_CI_CLANG_VER}") -fi -ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_STDLIB=${ALPAKA_CI_STDLIB}") -if [ ! -z ${ALPAKA_CI_CLANG_LIBSTDCPP_VERSION+x} ] -then - ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CLANG_LIBSTDCPP_VERSION=${ALPAKA_CI_CLANG_LIBSTDCPP_VERSION}") -fi -ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CMAKE_VER=${ALPAKA_CI_CMAKE_VER}") -ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CMAKE_DIR=${ALPAKA_CI_CMAKE_DIR}") -if [ ! -z "${ALPAKA_CI_GCC_VER+x}" ] -then - ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_GCC_VER=${ALPAKA_CI_GCC_VER}") -fi -if [ ! -z "${ALPAKA_CI_SANITIZERS+x}" ] -then - ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_SANITIZERS=${ALPAKA_CI_SANITIZERS}") -fi -if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE+x}" ] -then - ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE}") -fi -if [ ! 
-z "${ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE+x}" ] -then - ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE}") -fi -if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE+x}" ] -then - ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE}") -fi -if [ ! -z "${ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE+x}" ] -then - ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE=${ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE}") -fi -if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE+x}" ] -then - ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE}") -fi -if [ ! -z "${ALPAKA_ACC_CPU_BT_OMP4_ENABLE+x}" ] -then - ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_BT_OMP4_ENABLE=${ALPAKA_ACC_CPU_BT_OMP4_ENABLE}") -fi -if [ ! -z "${ALPAKA_ACC_GPU_CUDA_ENABLE+x}" ] -then - ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_GPU_CUDA_ENABLE=${ALPAKA_ACC_GPU_CUDA_ENABLE}") -fi -if [ ! -z "${ALPAKA_ACC_GPU_HIP_ENABLE+x}" ] -then - ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_GPU_HIP_ENABLE=${ALPAKA_ACC_GPU_HIP_ENABLE}") -fi -if [ ! -z "${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE+x}" ] -then - ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE=${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE}") -fi -ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_CUDA=${ALPAKA_CI_INSTALL_CUDA}") -if [ "${ALPAKA_CI_INSTALL_CUDA}" == "ON" ] -then - ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CUDA_DIR=${ALPAKA_CI_CUDA_DIR}") - ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_VERSION=${ALPAKA_CUDA_VERSION}") - if [ ! 
-z "${ALPAKA_CUDA_COMPILER+x}" ] - then - ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_COMPILER=${ALPAKA_CUDA_COMPILER}") - fi -fi -ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_HIP=${ALPAKA_CI_INSTALL_HIP}") -if [ "${ALPAKA_CI_INSTALL_HIP}" == "ON" ] -then - ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_HIP_ROOT_DIR=${ALPAKA_CI_HIP_ROOT_DIR}") - ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_HIP_BRANCH=${ALPAKA_CI_HIP_BRANCH}") - ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_HIP_PLATFORM=${ALPAKA_HIP_PLATFORM}") -fi -ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_TBB=${ALPAKA_CI_INSTALL_TBB}") -ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_FIBERS=${ALPAKA_CI_INSTALL_FIBERS}") - -if [ "${ALPAKA_DOCKER_BUILD_REQUIRED}" -eq 1 ] -then - docker run -v "$(pwd)":"$(pwd)" -w "$(pwd)" "${ALPAKA_DOCKER_ENV_LIST[@]}" "${ALPAKA_CI_DOCKER_BASE_IMAGE_NAME}" /bin/bash ./script/install.sh - - ALPAKA_DOCKER_CONTAINER_NAME=$(docker ps -l -q) - docker commit "${ALPAKA_DOCKER_CONTAINER_NAME}" "${ALPAKA_CI_DOCKER_IMAGE_NAME}" - - # delete the container and the base image to save disc space - docker stop "${ALPAKA_DOCKER_CONTAINER_NAME}" - docker rm "${ALPAKA_DOCKER_CONTAINER_NAME}" - docker rmi "${ALPAKA_CI_DOCKER_BASE_IMAGE_NAME}" - - docker save "${ALPAKA_CI_DOCKER_IMAGE_NAME}" | gzip > "${ALPAKA_CI_DOCKER_CACHE_IMAGE_FILE_PATH}" - - docker images -fi diff --git a/thirdParty/cupla/alpaka/script/install.sh b/thirdParty/cupla/alpaka/script/install.sh index 0cfde1eae6..533c249878 100755 --- a/thirdParty/cupla/alpaka/script/install.sh +++ b/thirdParty/cupla/alpaka/script/install.sh @@ -3,7 +3,7 @@ # # Copyright 2017-2019 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -19,35 +19,41 @@ source ./script/set.sh : ${ALPAKA_CI_INSTALL_HIP?"ALPAKA_CI_INSTALL_HIP must be specified"} : ${ALPAKA_CI_INSTALL_TBB?"ALPAKA_CI_INSTALL_TBB must be specified"} -if [ "$TRAVIS_OS_NAME" = "linux" ] +if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] then travis_retry apt-get -y --quiet update travis_retry apt-get -y install sudo + # tzdata is installed by software-properties-common but it requires some special handling + if [[ "$(cat /etc/os-release)" == *"20.04"* ]] + then + export DEBIAN_FRONTEND=noninteractive + travis_retry sudo apt-get --quiet --allow-unauthenticated --no-install-recommends install tzdata + fi + # software-properties-common: 'add-apt-repository' and certificates for wget https download # binutils: ld # xz-utils: xzcat travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install software-properties-common wget git make binutils xz-utils fi -if [ "$TRAVIS_OS_NAME" = "linux" ] || [ "$TRAVIS_OS_NAME" = "windows" ] +if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] || [ "$ALPAKA_CI_OS_NAME" = "Windows" ] then ./script/install_cmake.sh fi -if [ "$TRAVIS_OS_NAME" = "linux" ] -then - if [ "${ALPAKA_CI_ANALYSIS}" == "ON" ] ;then ./script/install_analysis.sh ;fi -fi +if [ "${ALPAKA_CI_ANALYSIS}" == "ON" ] ;then ./script/install_analysis.sh ;fi # Install CUDA before installing gcc as it installs gcc-4.8 and overwrites our selected compiler if [ "${ALPAKA_CI_INSTALL_CUDA}" == "ON" ] ;then ./script/install_cuda.sh ;fi -if [ "$TRAVIS_OS_NAME" = "linux" ] +if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] then if [ "${CXX}" == "g++" ] ;then ./script/install_gcc.sh ;fi if [ "${CXX}" == "clang++" ] ;then source ./script/install_clang.sh ;fi - if [ "${ALPAKA_CI_INSTALL_HIP}" == "ON" ] ;then ./script/install_hip.sh ;fi +elif [ "$ALPAKA_CI_OS_NAME" = "macOS" ] +then + sudo xcode-select -s "/Applications/Xcode_${ALPAKA_CI_XCODE_VER}.app/Contents/Developer" fi if [ "${ALPAKA_CI_INSTALL_TBB}" = "ON" ] 
@@ -57,10 +63,3 @@ fi ./script/install_boost.sh -if [ "$TRAVIS_OS_NAME" = "linux" ] -then - # Minimize docker image size - sudo apt-get --quiet --purge autoremove - sudo apt-get clean - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* -fi diff --git a/thirdParty/cupla/alpaka/script/install_analysis.sh b/thirdParty/cupla/alpaka/script/install_analysis.sh index 27bec475b8..2ade9f2288 100755 --- a/thirdParty/cupla/alpaka/script/install_analysis.sh +++ b/thirdParty/cupla/alpaka/script/install_analysis.sh @@ -3,7 +3,7 @@ # # Copyright 2017-2019 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this @@ -14,12 +14,28 @@ source ./script/travis_retry.sh source ./script/set.sh -#------------------------------------------------------------------------------- -# Install sloc -travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install sloccount -sloccount --version +if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] +then + #------------------------------------------------------------------------------- + # Install sloc + travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install sloccount + sloccount --version -#------------------------------------------------------------------------------- -# Install shellcheck -travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install shellcheck -shellcheck --version + #------------------------------------------------------------------------------- + # Install shellcheck + travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install shellcheck + shellcheck --version + +elif [ "$ALPAKA_CI_OS_NAME" = "macOS" ] +then + #------------------------------------------------------------------------------- + # Install sloc + brew install sloccount + sloccount --version 
+ + #------------------------------------------------------------------------------- + # Install shellcheck + brew install shellcheck + shellcheck --version + +fi diff --git a/thirdParty/cupla/alpaka/script/install_boost.sh b/thirdParty/cupla/alpaka/script/install_boost.sh index 033e63fa6c..af7e47bf1d 100755 --- a/thirdParty/cupla/alpaka/script/install_boost.sh +++ b/thirdParty/cupla/alpaka/script/install_boost.sh @@ -3,7 +3,7 @@ # # Copyright 2017-2019 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this @@ -14,7 +14,7 @@ source ./script/set.sh : "${BOOST_ROOT?'BOOST_ROOT must be specified'}" : "${ALPAKA_CI_BOOST_LIB_DIR?'ALPAKA_CI_BOOST_LIB_DIR must be specified'}" -if [ "$TRAVIS_OS_NAME" = "linux" ] +if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] then : "${ALPAKA_CI_STDLIB?'ALPAKA_CI_STDLIB must be specified'}" fi @@ -23,7 +23,7 @@ fi : "${CC?'CC must be specified'}" : "${ALPAKA_CI_INSTALL_FIBERS?'ALPAKA_CI_INSTALL_FIBERS must be specified'}" : "${ALPAKA_CI_BOOST_LIB_DIR?'ALPAKA_CI_BOOST_LIB_DIR must be specified'}" -if [ "$TRAVIS_OS_NAME" = "windows" ] +if [ "$ALPAKA_CI_OS_NAME" = "Windows" ] then : "${ALPAKA_CI_CL_VER?'ALPAKA_CI_CL_VER must be specified'}" fi @@ -31,7 +31,7 @@ fi git clone -b "${ALPAKA_CI_BOOST_BRANCH}" --quiet --recursive --single-branch --depth 1 https://github.com/boostorg/boost.git "${BOOST_ROOT}" # Bootstrap boost. -if [ "$TRAVIS_OS_NAME" = "windows" ] +if [ "$ALPAKA_CI_OS_NAME" = "Windows" ] then (cd "${BOOST_ROOT}"; ./bootstrap.bat) else @@ -40,7 +40,7 @@ fi (cd "${BOOST_ROOT}"; cat ./bootstrap.log) # Create file links. 
-if [ "$TRAVIS_OS_NAME" = "windows" ] +if [ "$ALPAKA_CI_OS_NAME" = "Windows" ] then (cd "${BOOST_ROOT}"; ./b2 headers) else @@ -58,19 +58,19 @@ then ALPAKA_BOOST_B2_CFLAGS="" ALPAKA_BOOST_B2_CXXFLAGS="" - if [ "$TRAVIS_OS_NAME" = "linux" ] || [ "$TRAVIS_OS_NAME" = "osx" ] + if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] || [ "$ALPAKA_CI_OS_NAME" = "macOS" ] then ALPAKA_BOOST_B2+="sudo " fi ALPAKA_BOOST_B2+="./b2 -j1" - if [ "$TRAVIS_OS_NAME" = "linux" ] || [ "$TRAVIS_OS_NAME" = "osx" ] + if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] || [ "$ALPAKA_CI_OS_NAME" = "macOS" ] then ALPAKA_BOOST_B2_CFLAGS+="-fPIC" ALPAKA_BOOST_B2_CXXFLAGS+="-fPIC" fi - if [ "$TRAVIS_OS_NAME" = "windows" ] + if [ "$ALPAKA_CI_OS_NAME" = "Windows" ] then ALPAKA_BOOST_B2+=" --layout=versioned" if [ "$ALPAKA_CI_CL_VER" = "2017" ] @@ -87,7 +87,7 @@ then # TODO: Win32: adress-model=32 ALPAKA_BOOST_B2+=" architecture=x86 address-model=64 link=static threading=multi runtime-link=shared" - if [ "$TRAVIS_OS_NAME" = "windows" ] + if [ "$ALPAKA_CI_OS_NAME" = "Windows" ] then ALPAKA_BOOST_B2+=" define=_CRT_NONSTDC_NO_DEPRECATE define=_CRT_SECURE_NO_DEPRECATE define=_SCL_SECURE_NO_DEPRECAT define=BOOST_USE_WINFIBERS define=_ENABLE_EXTENDED_ALIGNED_STORAGE" fi @@ -109,12 +109,19 @@ then # If the variable is not set, the backend will most probably be used by default so we install it. 
if [ "${ALPAKA_CI_INSTALL_FIBERS}" == "ON" ] then - if [ "$TRAVIS_OS_NAME" = "linux" ] + if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] then ALPAKA_BOOST_B2_CXXFLAGS+=" -std=c++14" fi ALPAKA_BOOST_B2+=" --with-fiber --with-context --with-thread --with-atomic --with-system --with-chrono --with-date_time" fi + if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] + then + if [ "${ALPAKA_CI_STDLIB}" == "libc++" ] + then + ALPAKA_BOOST_B2_CXXFLAGS+=" -stdlib=libc++" + fi + fi if [ "${ALPAKA_BOOST_B2_CFLAGS}" != "" ] then ALPAKA_BOOST_B2+=' cflags="' @@ -125,17 +132,10 @@ then then ALPAKA_BOOST_B2+=' cxxflags="' ALPAKA_BOOST_B2+="${ALPAKA_BOOST_B2_CXXFLAGS}" - if [ "$TRAVIS_OS_NAME" = "linux" ] - then - if [ "${ALPAKA_CI_STDLIB}" == "libc++" ] - then - ALPAKA_BOOST_B2+=" -stdlib=libc++" - fi - fi ALPAKA_BOOST_B2+='"' fi - if [ "$TRAVIS_OS_NAME" = "linux" ] + if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] then if [ "${ALPAKA_CI_STDLIB}" == "libc++" ] then @@ -150,7 +150,7 @@ then (cd "${BOOST_ROOT}"; eval "${ALPAKA_BOOST_B2}") # Clean the intermediate build files. - if [ "$TRAVIS_OS_NAME" = "windows" ] + if [ "$ALPAKA_CI_OS_NAME" = "Windows" ] then rm -rf bin.v2 else diff --git a/thirdParty/cupla/alpaka/script/install_clang.sh b/thirdParty/cupla/alpaka/script/install_clang.sh index fea8ac97f1..b7f6f4f0bf 100755 --- a/thirdParty/cupla/alpaka/script/install_clang.sh +++ b/thirdParty/cupla/alpaka/script/install_clang.sh @@ -3,7 +3,7 @@ # # Copyright 2017-2019 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -14,49 +14,28 @@ source ./script/travis_retry.sh source ./script/set.sh -: "${ALPAKA_CI_CLANG_DIR?'ALPAKA_CI_CLANG_DIR must be specified'}" : "${ALPAKA_CI_CLANG_VER?'ALPAKA_CI_CLANG_VER must be specified'}" -: "${ALPAKA_CI_CLANG_LIBSTDCPP_VERSION?'ALPAKA_CI_CLANG_LIBSTDCPP_VERSION must be specified'}" : "${ALPAKA_CI_STDLIB?'ALPAKA_CI_STDLIB must be specified'}" : "${CXX?'CXX must be specified'}" -if [ -z "$(ls -A "${ALPAKA_CI_CLANG_DIR}")" ] -then - ALPAKA_CLANG_PKG_FILE_NAME=clang+llvm-${ALPAKA_CI_CLANG_VER}-x86_64-linux-gnu-ubuntu-16.04.tar.xz - travis_retry wget --no-verbose "http://llvm.org/releases/${ALPAKA_CI_CLANG_VER}/${ALPAKA_CLANG_PKG_FILE_NAME}" - mkdir -p "${ALPAKA_CI_CLANG_DIR}" - xzcat "${ALPAKA_CLANG_PKG_FILE_NAME}" | tar -xf - --strip 1 -C "${ALPAKA_CI_CLANG_DIR}" - sudo rm -rf "${ALPAKA_CLANG_PKG_FILE_NAME}" -fi -"${ALPAKA_CI_CLANG_DIR}/bin/llvm-config" --version -export LLVM_CONFIG="${ALPAKA_CI_CLANG_DIR}/bin/llvm-config" +travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install clang-${ALPAKA_CI_CLANG_VER} -travis_retry sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test -travis_retry sudo apt-get -y --quiet update - -travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libstdc++-"${ALPAKA_CI_CLANG_LIBSTDCPP_VERSION}"-dev if [ "${ALPAKA_CI_STDLIB}" == "libc++" ] then + travis_retry sudo apt-get -y --quiet update travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libc++-dev travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libc++abi-dev fi -travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libiomp-dev -sudo update-alternatives --install /usr/bin/clang clang "${ALPAKA_CI_CLANG_DIR}"/bin/clang 50 -sudo update-alternatives --install /usr/bin/clang++ clang++ "${ALPAKA_CI_CLANG_DIR}"/bin/clang++ 
50 -sudo update-alternatives --install /usr/bin/cc cc "${ALPAKA_CI_CLANG_DIR}"/bin/clang 50 -sudo update-alternatives --install /usr/bin/c++ c++ "${ALPAKA_CI_CLANG_DIR}"/bin/clang++ 50 -# We have to prepend /usr/bin to the path because else the preinstalled clang from usr/bin/local/ is used. -export PATH=${ALPAKA_CI_CLANG_DIR}/bin:${PATH} -if [ -z ${LD_LIBRARY_PATH+x} ] -then - LD_LIBRARY_PATH= -fi -export LD_LIBRARY_PATH=${ALPAKA_CI_CLANG_DIR}/lib:${LD_LIBRARY_PATH} -if [ -z ${CPPFLAGS+x} ] + +if [ "${ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE}" = "ON" ] || [ "${ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE}" = "ON" ] || [ "${ALPAKA_ACC_CPU_BT_OMP4_ENABLE}" = "ON" ] then - CPPFLAGS= + travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libomp-dev fi -export CPPFLAGS="-I ${ALPAKA_CI_CLANG_DIR}/include/c++/v1 ${CPPFLAGS}" + +sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-"${ALPAKA_CI_CLANG_VER}" 50 +sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-"${ALPAKA_CI_CLANG_VER}" 50 +sudo update-alternatives --install /usr/bin/cc cc /usr/bin/clang-"${ALPAKA_CI_CLANG_VER}" 50 +sudo update-alternatives --install /usr/bin/c++ c++ /usr/bin/clang++-"${ALPAKA_CI_CLANG_VER}" 50 which "${CXX}" ${CXX} -v diff --git a/thirdParty/cupla/alpaka/script/install_cmake.sh b/thirdParty/cupla/alpaka/script/install_cmake.sh index 2e9a5988d4..57880a1dcf 100755 --- a/thirdParty/cupla/alpaka/script/install_cmake.sh +++ b/thirdParty/cupla/alpaka/script/install_cmake.sh @@ -3,7 +3,7 @@ # # Copyright 2017-2019 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -17,7 +17,7 @@ source ./script/set.sh : "${ALPAKA_CI_CMAKE_DIR?'ALPAKA_CI_CMAKE_DIR must be specified'}" : "${ALPAKA_CI_CMAKE_VER?'ALPAKA_CI_CMAKE_VER must be specified'}" -if [ "$TRAVIS_OS_NAME" = "linux" ] +if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] then # Download the selected version. if [ -z "$(ls -A ${ALPAKA_CI_CMAKE_DIR})" ] @@ -34,7 +34,7 @@ then sudo cp -fR "${ALPAKA_CI_CMAKE_DIR}"/"${ALPAKA_CMAKE_PKG_FILE_NAME_BASE}"/* "${ALPAKA_CI_CMAKE_DIR}" sudo rm -rf "${ALPAKA_CMAKE_PKG_FILE_NAME}" "${ALPAKA_CI_CMAKE_DIR}"/"${ALPAKA_CMAKE_PKG_FILE_NAME_BASE}" fi -elif [ "$TRAVIS_OS_NAME" = "windows" ] +elif [ "$ALPAKA_CI_OS_NAME" = "Windows" ] then choco uninstall cmake.install choco install cmake.install --no-progress --version ${ALPAKA_CI_CMAKE_VER} diff --git a/thirdParty/cupla/alpaka/script/install_cuda.sh b/thirdParty/cupla/alpaka/script/install_cuda.sh index 79c698399f..619020dd2e 100755 --- a/thirdParty/cupla/alpaka/script/install_cuda.sh +++ b/thirdParty/cupla/alpaka/script/install_cuda.sh @@ -3,7 +3,7 @@ # # Copyright 2017-2019 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -16,14 +16,17 @@ source ./script/set.sh : "${ALPAKA_CUDA_VERSION?'ALPAKA_CUDA_VERSION must be specified'}" -if [ "$TRAVIS_OS_NAME" = "linux" ] +if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] then - : "${ALPAKA_CI_DOCKER_BASE_IMAGE_NAME?'ALPAKA_CI_DOCKER_BASE_IMAGE_NAME must be specified'}" : "${ALPAKA_CI_CUDA_DIR?'ALPAKA_CI_CUDA_DIR must be specified'}" : "${ALPAKA_CUDA_COMPILER?'ALPAKA_CUDA_COMPILER must be specified'}" # Ubuntu 18.04 requires some extra keys for verification - if [[ "${ALPAKA_CI_DOCKER_BASE_IMAGE_NAME}" == *"18.04"* ]] + if [[ "$(cat /etc/os-release)" == *"18.04"* ]] + then + travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install dirmngr gpg-agent + travis_retry sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F60F4B3D7FA2AF80 + elif [[ "$(cat /etc/os-release)" == *"20.04"* ]] then travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install dirmngr gpg-agent travis_retry sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F60F4B3D7FA2AF80 @@ -60,8 +63,13 @@ then ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1804-10-2-local ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"-10.2.89-440.33.01_1.0-1_amd64.deb ALPAKA_CUDA_PKG_FILE_PATH=http://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME} + elif [ "${ALPAKA_CUDA_VERSION}" == "11.0" ] + then + ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1804-11-0-local + ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"_11.0.2-450.51.05-1_amd64.deb + ALPAKA_CUDA_PKG_FILE_PATH=http://developer.download.nvidia.com/compute/cuda/11.0.2/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME} else - echo CUDA versions other than 9.0, 9.1, 9.2, 10.0, 10.1 and 10.2 are not currently supported on linux! + echo CUDA versions other than 9.0, 9.1, 9.2, 10.0, 10.1, 10.2 and 11.0 are not currently supported on linux! 
fi if [ -z "$(ls -A ${ALPAKA_CI_CUDA_DIR})" ] then @@ -73,9 +81,14 @@ then travis_retry sudo apt-get -y --quiet update # Install CUDA - # Currently we do not install CUDA fully: sudo apt-get --quiet -y install cuda - # We only install the minimal packages. Because of our manual partial installation we have to create a symlink at /usr/local/cuda - sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install cuda-core-"${ALPAKA_CUDA_VERSION}" cuda-cudart-"${ALPAKA_CUDA_VERSION}" cuda-cudart-dev-"${ALPAKA_CUDA_VERSION}" cuda-curand-"${ALPAKA_CUDA_VERSION}" cuda-curand-dev-"${ALPAKA_CUDA_VERSION}" + if [ "${ALPAKA_CUDA_VERSION}" == "11.0" ] + then + sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install cuda-compiler-"${ALPAKA_CUDA_VERSION}" cuda-cudart-"${ALPAKA_CUDA_VERSION}" cuda-cudart-dev-"${ALPAKA_CUDA_VERSION}" libcurand-"${ALPAKA_CUDA_VERSION}" libcurand-dev-"${ALPAKA_CUDA_VERSION}" + else + # Currently we do not install CUDA fully: sudo apt-get --quiet -y install cuda + # We only install the minimal packages. 
Because of our manual partial installation we have to create a symlink at /usr/local/cuda + sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install cuda-core-"${ALPAKA_CUDA_VERSION}" cuda-cudart-"${ALPAKA_CUDA_VERSION}" cuda-cudart-dev-"${ALPAKA_CUDA_VERSION}" cuda-curand-"${ALPAKA_CUDA_VERSION}" cuda-curand-dev-"${ALPAKA_CUDA_VERSION}" + fi sudo ln -s /usr/local/cuda-"${ALPAKA_CUDA_VERSION}" /usr/local/cuda if [ "${ALPAKA_CUDA_COMPILER}" == "clang" ] @@ -86,7 +99,7 @@ then # clean up sudo rm -rf "${ALPAKA_CI_CUDA_DIR}"/"${ALPAKA_CUDA_PKG_FILE_NAME}" sudo dpkg --purge "${ALPAKA_CUDA_PKG_DEB_NAME}" -elif [ "$TRAVIS_OS_NAME" = "windows" ] +elif [ "$ALPAKA_CI_OS_NAME" = "Windows" ] then if [ "${ALPAKA_CUDA_VERSION}" == "10.0" ] then diff --git a/thirdParty/cupla/alpaka/script/install_doxygen.sh b/thirdParty/cupla/alpaka/script/install_doxygen.sh index 9aa6b63f21..85aa67096b 100755 --- a/thirdParty/cupla/alpaka/script/install_doxygen.sh +++ b/thirdParty/cupla/alpaka/script/install_doxygen.sh @@ -3,7 +3,7 @@ # # Copyright 2020 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/script/install_gcc.sh b/thirdParty/cupla/alpaka/script/install_gcc.sh index d37e62384e..5a840b806f 100755 --- a/thirdParty/cupla/alpaka/script/install_gcc.sh +++ b/thirdParty/cupla/alpaka/script/install_gcc.sh @@ -3,7 +3,7 @@ # # Copyright 2017-2019 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/script/install_hip.sh b/thirdParty/cupla/alpaka/script/install_hip.sh deleted file mode 100755 index ddba56b0ee..0000000000 --- a/thirdParty/cupla/alpaka/script/install_hip.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -# -# Copyright 2018-2019 Benjamin Worpitz -# -# This file is part of Alpaka. -# -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at http://mozilla.org/MPL/2.0/. -# - -source ./script/set.sh - -: "${ALPAKA_CI_HIP_ROOT_DIR?'ALPAKA_CI_HIP_ROOT_DIR must be specified'}" -: "${ALPAKA_CI_HIP_BRANCH?'ALPAKA_CI_HIP_BRANCH must be specified'}" -: "${CMAKE_BUILD_TYPE?'CMAKE_BUILD_TYPE must be specified'}" -: "${CXX?'CXX must be specified'}" -: "${CC?'CC must be specified'}" -: "${ALPAKA_CI_CMAKE_DIR?'ALPAKA_CI_CMAKE_DIR must be specified'}" - -# CMake -export PATH=${ALPAKA_CI_CMAKE_DIR}/bin:${PATH} -cmake --version - -HIP_SOURCE_DIR=${ALPAKA_CI_HIP_ROOT_DIR}/source-hip/ - -git clone -b "${ALPAKA_CI_HIP_BRANCH}" --quiet --recursive --single-branch https://github.com/ROCm-Developer-Tools/HIP.git "${HIP_SOURCE_DIR}" -(cd "${HIP_SOURCE_DIR}"; mkdir -p build; cd build; cmake -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}" -DCMAKE_INSTALL_PREFIX="${ALPAKA_CI_HIP_ROOT_DIR}" -DBUILD_TESTING=OFF .. && make && make install) - - -## rocRAND -export HIP_PLATFORM=nvcc -export HIP_RUNTIME=nvcc -export ROCRAND_SOURCE_DIR=${ALPAKA_CI_HIP_ROOT_DIR}/source-rocrand/ -if [ ! 
-d "${ROCRAND_SOURCE_DIR}" ] -then - # install it into the HIP install dir - git clone --quiet --recursive https://github.com/ROCmSoftwarePlatform/rocRAND "${ROCRAND_SOURCE_DIR}" - (cd "${ROCRAND_SOURCE_DIR}"; mkdir -p build; cd build; cmake -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}" -DCMAKE_INSTALL_PREFIX="${ALPAKA_CI_HIP_ROOT_DIR}" -DBUILD_BENCHMARK=OFF -DBUILD_TEST=OFF -DNVGPU_TARGETS="30" -DCMAKE_MODULE_PATH="${ALPAKA_CI_HIP_ROOT_DIR}/cmake" -DHIP_PLATFORM="${HIP_PLATFORM}" .. && make && make install) -fi diff --git a/thirdParty/cupla/alpaka/script/install_tbb.sh b/thirdParty/cupla/alpaka/script/install_tbb.sh index 99fa573f6d..4d18594748 100755 --- a/thirdParty/cupla/alpaka/script/install_tbb.sh +++ b/thirdParty/cupla/alpaka/script/install_tbb.sh @@ -3,7 +3,7 @@ # # Copyright 2017-2019 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this @@ -15,14 +15,14 @@ source ./script/travis_retry.sh source ./script/set.sh # Install TBB -if [ "$TRAVIS_OS_NAME" = "linux" ] +if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] then travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libtbb-dev -elif [ "$TRAVIS_OS_NAME" = "osx" ] +elif [ "$ALPAKA_CI_OS_NAME" = "macOS" ] then brew unlink python@2 brew install tbb -elif [ "$TRAVIS_OS_NAME" = "windows" ] +elif [ "$ALPAKA_CI_OS_NAME" = "Windows" ] then TBB_ARCHIVE_VER="tbb44_20160526oss" TBB_DOWNLOAD_URL="https://github.com/intel/tbb/releases/download/4.4.5/${TBB_ARCHIVE_VER}_win.zip" diff --git a/thirdParty/cupla/alpaka/script/prepare_sanitizers.sh b/thirdParty/cupla/alpaka/script/prepare_sanitizers.sh index 7c9bddfbeb..fedacea00f 100755 --- a/thirdParty/cupla/alpaka/script/prepare_sanitizers.sh +++ b/thirdParty/cupla/alpaka/script/prepare_sanitizers.sh @@ -3,7 +3,7 @@ # # Copyright 2017-2019 Benjamin Worpitz # -# This file is part of 
Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/script/print_env.sh b/thirdParty/cupla/alpaka/script/print_env.sh index 4694d0cd22..0c2b4ea213 100755 --- a/thirdParty/cupla/alpaka/script/print_env.sh +++ b/thirdParty/cupla/alpaka/script/print_env.sh @@ -3,48 +3,33 @@ # # Copyright 2017-2019 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. # +source ./script/travis_retry.sh + source ./script/set.sh #------------------------------------------------------------------------------- -if [ "$ALPAKA_CI" = "TRAVIS" ] -then - # Print the travis environment variables: http://docs.travis-ci.com/user/ci-environment/ - echo TRAVIS_BRANCH: "${TRAVIS_BRANCH}" - echo TRAVIS_BUILD_DIR: "${TRAVIS_BUILD_DIR}" - echo TRAVIS_BUILD_ID: "${TRAVIS_BUILD_ID}" - echo TRAVIS_BUILD_NUMBER: "${TRAVIS_BUILD_NUMBER}" - echo TRAVIS_COMMIT: "${TRAVIS_COMMIT}" - echo TRAVIS_COMMIT_RANGE: "${TRAVIS_COMMIT_RANGE}" - echo TRAVIS_JOB_ID: "${TRAVIS_JOB_ID}" - echo TRAVIS_JOB_NUMBER: "${TRAVIS_JOB_NUMBER}" - echo TRAVIS_PULL_REQUEST: "${TRAVIS_PULL_REQUEST}" - echo TRAVIS_SECURE_ENV_VARS: "${TRAVIS_SECURE_ENV_VARS}" - echo TRAVIS_REPO_SLUG: "${TRAVIS_REPO_SLUG}" - echo TRAVIS_OS_NAME: "${TRAVIS_OS_NAME}" - echo TRAVIS_TAG: "${TRAVIS_TAG}" -elif [ "$ALPAKA_CI" = "GITHUB" ] +if [ "$ALPAKA_CI" = "GITHUB" ] then echo GITHUB_WORKSPACE: "${GITHUB_WORKSPACE}" fi -if [ "$TRAVIS_OS_NAME" = "linux" ] +if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] then # Show all running services sudo service --status-all # Stop some unnecessary services to save memory sudo /etc/init.d/mysql stop - sudo /etc/init.d/postgresql stop - sudo 
/etc/init.d/redis-server stop # Show memory stats + travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install smem sudo smem sudo free -m -t fi diff --git a/thirdParty/cupla/alpaka/script/push_doc.sh b/thirdParty/cupla/alpaka/script/push_doc.sh index cac08d09ba..4bcfb7f2cf 100755 --- a/thirdParty/cupla/alpaka/script/push_doc.sh +++ b/thirdParty/cupla/alpaka/script/push_doc.sh @@ -3,7 +3,7 @@ # # Copyright 2020 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this @@ -14,7 +14,7 @@ source ./script/travis_retry.sh source ./script/set.sh -cd doc/doxygen/html +cd docs/doxygen/html git config --global user.email "action@github.com" git config --global user.name "GitHub Action" diff --git a/thirdParty/cupla/alpaka/script/run.sh b/thirdParty/cupla/alpaka/script/run.sh index 27f22544ac..e0e7ae03c4 100755 --- a/thirdParty/cupla/alpaka/script/run.sh +++ b/thirdParty/cupla/alpaka/script/run.sh @@ -3,7 +3,7 @@ # # Copyright 2017-2019 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -18,7 +18,7 @@ echo "ALPAKA_CI_CMAKE_DIR: ${ALPAKA_CI_CMAKE_DIR}" echo "ALPAKA_CI_ANALYSIS: ${ALPAKA_CI_ANALYSIS}" : "${ALPAKA_CI_INSTALL_CUDA?'ALPAKA_CI_INSTALL_CUDA must be specified'}" : "${ALPAKA_CI_INSTALL_HIP?'ALPAKA_CI_INSTALL_HIP must be specified'}" -if [ "$TRAVIS_OS_NAME" = "linux" ] +if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] then : "${ALPAKA_CI_STDLIB?'ALPAKA_CI_STDLIB must be specified'}" echo "ALPAKA_CI_STDLIB: ${ALPAKA_CI_STDLIB}" @@ -27,7 +27,7 @@ fi echo "CXX: ${CXX}" -if [ "$TRAVIS_OS_NAME" = "linux" ] +if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] then if [ -z "${LD_LIBRARY_PATH+x}" ] then @@ -36,14 +36,14 @@ then fi # CMake -if [ "$TRAVIS_OS_NAME" = "linux" ] +if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] then export PATH=${ALPAKA_CI_CMAKE_DIR}/bin:${PATH} fi cmake --version #TBB -if [ "$TRAVIS_OS_NAME" = "windows" ] +if [ "$ALPAKA_CI_OS_NAME" = "Windows" ] then #ALPAKA_TBB_BIN_DIR="${TBB_ROOT}/bin/ia32/vc14" ALPAKA_TBB_BIN_DIR="${TBB_ROOT}/bin/intel64/vc14" @@ -55,7 +55,7 @@ if [ "${ALPAKA_CI_INSTALL_CUDA}" == "ON" ] then : "${ALPAKA_CUDA_VERSION?'ALPAKA_CUDA_VERSION must be specified'}" - if [ "$TRAVIS_OS_NAME" = "linux" ] + if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] then # CUDA export PATH=/usr/local/cuda-${ALPAKA_CUDA_VERSION}/bin:$PATH @@ -68,7 +68,7 @@ then which nvcc nvcc -V fi - elif [ "$TRAVIS_OS_NAME" = "windows" ] + elif [ "$ALPAKA_CI_OS_NAME" = "Windows" ] then export PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${ALPAKA_CUDA_VERSION}\bin":$PATH export CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${ALPAKA_CUDA_VERSION}" @@ -82,25 +82,14 @@ then # HIP # HIP_PATH required by HIP tools - export HIP_PATH=${ALPAKA_CI_HIP_ROOT_DIR} - # CUDA_PATH required by HIP tools - if [ -n "$(command -v nvcc)" ] - then - export CUDA_PATH=$(dirname $(which nvcc))/../ - else - export CUDA_PATH=/usr/local/cuda-${ALPAKA_CUDA_VERSION} - fi + export HIP_PATH=/opt/rocm export 
PATH=${HIP_PATH}/bin:$PATH export LD_LIBRARY_PATH=${HIP_PATH}/lib64:${HIP_PATH}/hiprand/lib:${LD_LIBRARY_PATH} export CMAKE_PREFIX_PATH=${HIP_PATH}:${HIP_PATH}/hiprand:${CMAKE_PREFIX_PATH:-} - # to avoid "use of uninitialized value .." warnings in perl script hipcc - # TODO: rely on CI vars for platform and architecture - export HIP_PLATFORM=nvcc - export HIP_RUNTIME=nvcc # calls nvcc or clang which hipcc - hipcc -V + hipcc --version which hipconfig hipconfig --platform hipconfig -v @@ -109,21 +98,8 @@ then fi -# clang -if [ "${CXX}" == "clang++" ] -then - # We have to prepend /usr/bin to the path because else the preinstalled clang from usr/bin/local/ is used. - export PATH=${ALPAKA_CI_CLANG_DIR}/bin:${PATH} - export LD_LIBRARY_PATH=${ALPAKA_CI_CLANG_DIR}/lib:${LD_LIBRARY_PATH} - if [ -z "${CPPFLAGS+x}" ] - then - CPPFLAGS= - fi - export CPPFLAGS="-I ${ALPAKA_CI_CLANG_DIR}/include/c++/v1 ${CPPFLAGS}" -fi - # stdlib -if [ "$TRAVIS_OS_NAME" = "linux" ] +if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] then if [ "${ALPAKA_CI_STDLIB}" == "libc++" ] then @@ -144,9 +120,28 @@ then ${CXX} -v source ./script/prepare_sanitizers.sh - if [ "${ALPAKA_CI_ANALYSIS}" == "ON" ] ;then ./script/run_analysis.sh ;fi fi -./script/run_build.sh +if [ "$ALPAKA_CI_OS_NAME" = "Windows" ] +then + : ${ALPAKA_CI_CL_VER?"ALPAKA_CI_CL_VER must be specified"} + + # Use the 64 bit compiler + # FIXME: Path not found but does not seem to be necessary anymore + #"./C/Program Files (x86)/Microsoft Visual Studio/2017/Community/VC/Auxiliary/Build/vcvarsall.bat" amd64 + + # Add msbuild to the path + if [ "$ALPAKA_CI_CL_VER" = "2017" ] + then + export MSBUILD_EXECUTABLE="/C/Program Files (x86)/Microsoft Visual Studio/2017/Enterprise/MSBuild/15.0/Bin/MSBuild.exe" + elif [ "$ALPAKA_CI_CL_VER" = "2019" ] + then + export MSBUILD_EXECUTABLE=$(vswhere.exe -latest -requires Microsoft.Component.MSBuild -find "MSBuild\**\Bin\MSBuild.exe") + fi + "$MSBUILD_EXECUTABLE" -version +fi +./script/run_generate.sh 
+./script/run_build.sh if [ "${ALPAKA_CI_ANALYSIS}" == "OFF" ] ;then ./script/run_tests.sh ;fi +if [ "${ALPAKA_CI_ANALYSIS}" == "ON" ] ;then ./script/run_analysis.sh ;fi diff --git a/thirdParty/cupla/alpaka/script/run_analysis.sh b/thirdParty/cupla/alpaka/script/run_analysis.sh index a4f369ec10..add4b06232 100755 --- a/thirdParty/cupla/alpaka/script/run_analysis.sh +++ b/thirdParty/cupla/alpaka/script/run_analysis.sh @@ -3,7 +3,7 @@ # # Copyright 2017-2019 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this @@ -12,16 +12,19 @@ source ./script/set.sh -#------------------------------------------------------------------------------- -# sloc -sloccount . +if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] || [ "$ALPAKA_CI_OS_NAME" = "macOS" ] +then + #------------------------------------------------------------------------------- + # sloc + sloccount . -#------------------------------------------------------------------------------- -# TODO/FIXME/HACK -grep -r HACK ./* || true -grep -r FIXME ./* || true -grep -r TODO ./* || true + #------------------------------------------------------------------------------- + # TODO/FIXME/HACK + grep -r HACK ./* || true + grep -r FIXME ./* || true + grep -r TODO ./* || true -#------------------------------------------------------------------------------- -# check shell script with shellcheck -find . -type f -name "*.sh" -exec shellcheck {} \; + #------------------------------------------------------------------------------- + # check shell script with shellcheck + find . 
-type f -name "*.sh" -exec shellcheck {} \; +fi diff --git a/thirdParty/cupla/alpaka/script/run_build.sh b/thirdParty/cupla/alpaka/script/run_build.sh index ce7a410b4f..edd7bf88c9 100755 --- a/thirdParty/cupla/alpaka/script/run_build.sh +++ b/thirdParty/cupla/alpaka/script/run_build.sh @@ -3,7 +3,7 @@ # # Copyright 2014-2019 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this @@ -12,113 +12,14 @@ source ./script/set.sh -#------------------------------------------------------------------------------- - -# create a cmake variable definition if an environment variable exists -# -# This function can not handle environment variables with spaces in its content. -# -# @param $1 cmake/environment variable name -# -# @result if $1 exists cmake variable definition else nothing is returned -# -# @code{.bash} -# FOO=ON -# echo "$(env2cmake FOO)" # returns "-DFOO=ON" -# echo "$(env2cmake BAR)" # returns nothing -# @endcode -function env2cmake() -{ - if [ ! -z "${1+x}" ] ; then - echo -n "-D$1=${!1}" - fi -} - -#------------------------------------------------------------------------------- -# Build and execute all tests. -if [ ! -z "${CMAKE_CXX_FLAGS+x}" ] -then - echo "CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" -fi -if [ ! -z "${CMAKE_EXE_LINKER_FLAGS+x}" ] -then - echo "CMAKE_EXE_LINKER_FLAGS=${CMAKE_EXE_LINKER_FLAGS}" -fi -if [ ! -z "${KMP_DEVICE_THREAD_LIMIT+x}" ] -then - echo "KMP_DEVICE_THREAD_LIMIT=${KMP_DEVICE_THREAD_LIMIT}" -fi -if [ ! -z "${KMP_ALL_THREADS+x}" ] -then - echo "KMP_ALL_THREADS=${KMP_ALL_THREADS}" -fi -if [ ! -z "${KMP_TEAMS_THREAD_LIMIT+x}" ] -then - echo "KMP_TEAMS_THREAD_LIMIT=${KMP_TEAMS_THREAD_LIMIT}" -fi -if [ ! -z "${OMP_THREAD_LIMIT+x}" ] -then - echo "OMP_THREAD_LIMIT=${OMP_THREAD_LIMIT}" -fi -if [ ! 
-z "${OMP_NUM_THREADS+x}" ] -then - echo "OMP_NUM_THREADS=${OMP_NUM_THREADS}" -fi - -mkdir -p build/ cd build/ -ALPAKA_CI_CMAKE_GENERATOR_PLATFORM="" -if [ "$TRAVIS_OS_NAME" = "linux" ] || [ "$TRAVIS_OS_NAME" = "osx" ] -then - ALPAKA_CI_CMAKE_GENERATOR="Unix Makefiles" -elif [ "$TRAVIS_OS_NAME" = "windows" ] -then - # Use the 64 bit compiler - # FIXME: Path not found but does not seem to be necessary anymore - #"./C/Program Files (x86)/Microsoft Visual Studio/2017/Community/VC/Auxiliary/Build/vcvarsall.bat" amd64 - - # Add msbuild to the path - if [ "$ALPAKA_CI_CL_VER" = "2017" ] - then - MSBUILD_EXECUTABLE="/C/Program Files (x86)/Microsoft Visual Studio/2017/Enterprise/MSBuild/15.0/Bin/MSBuild.exe" - elif [ "$ALPAKA_CI_CL_VER" = "2019" ] - then - MSBUILD_EXECUTABLE=$(vswhere.exe -latest -requires Microsoft.Component.MSBuild -find "MSBuild\**\Bin\MSBuild.exe") - fi - "$MSBUILD_EXECUTABLE" -version - - : ${ALPAKA_CI_CL_VER?"ALPAKA_CI_CL_VER must be specified"} - - # Select the generator - if [ "$ALPAKA_CI_CL_VER" = "2017" ] - then - ALPAKA_CI_CMAKE_GENERATOR="Visual Studio 15 2017" - elif [ "$ALPAKA_CI_CL_VER" = "2019" ] - then - ALPAKA_CI_CMAKE_GENERATOR="Visual Studio 16 2019" - fi - ALPAKA_CI_CMAKE_GENERATOR_PLATFORM="-A x64" -fi - -cmake -G "${ALPAKA_CI_CMAKE_GENERATOR}" ${ALPAKA_CI_CMAKE_GENERATOR_PLATFORM} \ - "$(env2cmake BOOST_ROOT)" -DBOOST_LIBRARYDIR="${ALPAKA_CI_BOOST_LIB_DIR}/lib" -DBoost_USE_STATIC_LIBS=ON -DBoost_USE_MULTITHREADED=ON -DBoost_USE_STATIC_RUNTIME=OFF \ - "$(env2cmake CMAKE_BUILD_TYPE)" "$(env2cmake CMAKE_CXX_FLAGS)" "$(env2cmake CMAKE_EXE_LINKER_FLAGS)" \ - "$(env2cmake ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE)" "$(env2cmake ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE)" "$(env2cmake ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE)" \ - "$(env2cmake ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE)" \ - "$(env2cmake ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE)" "$(env2cmake ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE)" "$(env2cmake ALPAKA_ACC_CPU_BT_OMP4_ENABLE)" \ - "$(env2cmake TBB_ROOT)" \ - 
"$(env2cmake ALPAKA_ACC_GPU_CUDA_ENABLE)" "$(env2cmake ALPAKA_CUDA_VERSION)" "$(env2cmake ALPAKA_ACC_GPU_CUDA_ONLY_MODE)" "$(env2cmake ALPAKA_CUDA_ARCH)" "$(env2cmake ALPAKA_CUDA_COMPILER)" \ - "$(env2cmake ALPAKA_CUDA_FAST_MATH)" "$(env2cmake ALPAKA_CUDA_FTZ)" "$(env2cmake ALPAKA_CUDA_SHOW_REGISTER)" "$(env2cmake ALPAKA_CUDA_KEEP_FILES)" "$(env2cmake ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA)" "$(env2cmake ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION)" \ - "$(env2cmake ALPAKA_ACC_GPU_HIP_ENABLE)" "$(env2cmake ALPAKA_ACC_GPU_HIP_ONLY_MODE)" "$(env2cmake ALPAKA_HIP_PLATFORM)" \ - "$(env2cmake ALPAKA_DEBUG)" "$(env2cmake ALPAKA_CI)" "$(env2cmake ALPAKA_CI_ANALYSIS)" "$(env2cmake ALPAKA_CXX_STANDARD)" \ - ".." -if [ "$TRAVIS_OS_NAME" = "linux" ] || [ "$TRAVIS_OS_NAME" = "osx" ] +if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] || [ "$ALPAKA_CI_OS_NAME" = "macOS" ] then make VERBOSE=1 -elif [ "$TRAVIS_OS_NAME" = "windows" ] +elif [ "$ALPAKA_CI_OS_NAME" = "Windows" ] then - "$MSBUILD_EXECUTABLE" "alpaka.sln" -p:Configuration=${CMAKE_BUILD_TYPE} -maxcpucount:2 -verbosity:minimal + "$MSBUILD_EXECUTABLE" "alpaka.sln" -p:Configuration=${CMAKE_BUILD_TYPE} -maxcpucount:1 -verbosity:minimal fi cd .. diff --git a/thirdParty/cupla/alpaka/script/run_doxygen.sh b/thirdParty/cupla/alpaka/script/run_doxygen.sh index 5eff35b157..13bcf29625 100755 --- a/thirdParty/cupla/alpaka/script/run_doxygen.sh +++ b/thirdParty/cupla/alpaka/script/run_doxygen.sh @@ -3,7 +3,7 @@ # # Copyright 2020 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this @@ -24,15 +24,14 @@ source ./script/set.sh #- Clean the branch: `git rm -rf .` #- Commit and push the branch: `git add --all`, `git commit -m"add gh-pages branch"`, `git push` -# Clone the gh-pages branch into the doc/doxygen/html folder. 
-git clone -b gh-pages https://x-access-token:${2}@github.com/${1}.git doc/doxygen/html +# Clone the gh-pages branch into the docs/doxygen/html folder. +git clone -b gh-pages https://x-access-token:${2}@github.com/${1}.git docs/doxygen/html -cd doc/doxygen/html +cd docs/ -rm -rf * - -cd .. +rm -rf doxygen/html/* +rm -rf doxygen/xml/* doxygen Doxyfile -cd ../.. +cd ../ diff --git a/thirdParty/cupla/alpaka/script/run_generate.sh b/thirdParty/cupla/alpaka/script/run_generate.sh new file mode 100755 index 0000000000..51bd26c4b2 --- /dev/null +++ b/thirdParty/cupla/alpaka/script/run_generate.sh @@ -0,0 +1,90 @@ +#!/bin/bash + +# +# Copyright 2014-2019 Benjamin Worpitz +# +# This file is part of alpaka. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# + +source ./script/set.sh + +#------------------------------------------------------------------------------- + +# create a cmake variable definition if an environment variable exists +# +# This function can not handle environment variables with spaces in its content. +# +# @param $1 cmake/environment variable name +# +# @result if $1 exists cmake variable definition else nothing is returned +# +# @code{.bash} +# FOO=ON +# echo "$(env2cmake FOO)" # returns "-DFOO=ON" +# echo "$(env2cmake BAR)" # returns nothing +# @endcode +function env2cmake() +{ + if [ ! -z "${1+x}" ] ; then + echo -n "-D$1=${!1}" + fi +} + +#------------------------------------------------------------------------------- +if [ ! -z "${CMAKE_CXX_FLAGS+x}" ] +then + echo "CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" +fi +if [ ! 
-z "${CMAKE_EXE_LINKER_FLAGS+x}" ] +then + echo "CMAKE_EXE_LINKER_FLAGS=${CMAKE_EXE_LINKER_FLAGS}" +fi + +ALPAKA_CI_CMAKE_EXECUTABLE=cmake +if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] +then + ALPAKA_CI_CMAKE_EXECUTABLE="${ALPAKA_CI_CMAKE_DIR}/bin/cmake" +fi + +ALPAKA_CI_CMAKE_GENERATOR_PLATFORM="" +if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] || [ "$ALPAKA_CI_OS_NAME" = "macOS" ] +then + ALPAKA_CI_CMAKE_GENERATOR="Unix Makefiles" +elif [ "$ALPAKA_CI_OS_NAME" = "Windows" ] +then + : ${ALPAKA_CI_CL_VER?"ALPAKA_CI_CL_VER must be specified"} + + # Select the generator + if [ "$ALPAKA_CI_CL_VER" = "2017" ] + then + ALPAKA_CI_CMAKE_GENERATOR="Visual Studio 15 2017" + elif [ "$ALPAKA_CI_CL_VER" = "2019" ] + then + ALPAKA_CI_CMAKE_GENERATOR="Visual Studio 16 2019" + fi + ALPAKA_CI_CMAKE_GENERATOR_PLATFORM="-A x64" +fi + +mkdir -p build/ +cd build/ + +"${ALPAKA_CI_CMAKE_EXECUTABLE}" -G "${ALPAKA_CI_CMAKE_GENERATOR}" ${ALPAKA_CI_CMAKE_GENERATOR_PLATFORM} \ + -Dalpaka_BUILD_EXAMPLES=ON -DBUILD_TESTING=ON \ + "$(env2cmake BOOST_ROOT)" -DBOOST_LIBRARYDIR="${ALPAKA_CI_BOOST_LIB_DIR}/lib" -DBoost_USE_STATIC_LIBS=ON -DBoost_USE_MULTITHREADED=ON -DBoost_USE_STATIC_RUNTIME=OFF -DBoost_ARCHITECTURE="-x64" \ + "$(env2cmake CMAKE_BUILD_TYPE)" "$(env2cmake CMAKE_CXX_FLAGS)" "$(env2cmake CMAKE_EXE_LINKER_FLAGS)" \ + "$(env2cmake ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE)" "$(env2cmake ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE)" "$(env2cmake ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE)" \ + "$(env2cmake ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE)" \ + "$(env2cmake ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE)" "$(env2cmake ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE)" "$(env2cmake ALPAKA_ACC_CPU_BT_OMP4_ENABLE)" \ + "$(env2cmake TBB_ROOT)" \ + "$(env2cmake ALPAKA_ACC_GPU_CUDA_ENABLE)" "$(env2cmake ALPAKA_CUDA_VERSION)" "$(env2cmake ALPAKA_ACC_GPU_CUDA_ONLY_MODE)" "$(env2cmake ALPAKA_CUDA_ARCH)" "$(env2cmake ALPAKA_CUDA_COMPILER)" \ + "$(env2cmake ALPAKA_CUDA_FAST_MATH)" "$(env2cmake ALPAKA_CUDA_FTZ)" "$(env2cmake ALPAKA_CUDA_SHOW_REGISTER)" 
"$(env2cmake ALPAKA_CUDA_KEEP_FILES)" "$(env2cmake ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA)" "$(env2cmake ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION)" \ + "$(env2cmake ALPAKA_ACC_GPU_HIP_ENABLE)" "$(env2cmake ALPAKA_ACC_GPU_HIP_ONLY_MODE)" "$(env2cmake ALPAKA_HIP_PLATFORM)" \ + "$(env2cmake ALPAKA_EMU_MEMCPY3D)" \ + "$(env2cmake ALPAKA_DEBUG)" "$(env2cmake ALPAKA_CI)" "$(env2cmake ALPAKA_CI_ANALYSIS)" "$(env2cmake ALPAKA_CXX_STANDARD)" \ + ".." + +cd .. diff --git a/thirdParty/cupla/alpaka/script/run_tests.sh b/thirdParty/cupla/alpaka/script/run_tests.sh index 6c55b6bcde..2a6a0c00f0 100755 --- a/thirdParty/cupla/alpaka/script/run_tests.sh +++ b/thirdParty/cupla/alpaka/script/run_tests.sh @@ -3,7 +3,7 @@ # # Copyright 2017-2019 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this @@ -15,14 +15,23 @@ source ./script/set.sh : "${ALPAKA_ACC_GPU_CUDA_ENABLE?'ALPAKA_ACC_GPU_CUDA_ENABLE must be specified'}" : "${ALPAKA_ACC_GPU_HIP_ENABLE?'ALPAKA_ACC_GPU_HIP_ENABLE must be specified'}" +if [ ! -z "${OMP_THREAD_LIMIT+x}" ] +then + echo "OMP_THREAD_LIMIT=${OMP_THREAD_LIMIT}" +fi +if [ ! 
-z "${OMP_NUM_THREADS+x}" ] +then + echo "OMP_NUM_THREADS=${OMP_NUM_THREADS}" +fi + if [ "${ALPAKA_ACC_GPU_CUDA_ENABLE}" == "OFF" ] && [ "${ALPAKA_ACC_GPU_HIP_ENABLE}" == "OFF" ]; then cd build/ - if [ "$TRAVIS_OS_NAME" = "linux" ] || [ "$TRAVIS_OS_NAME" = "osx" ] + if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] || [ "$ALPAKA_CI_OS_NAME" = "macOS" ] then ctest -V - elif [ "$TRAVIS_OS_NAME" = "windows" ] + elif [ "$ALPAKA_CI_OS_NAME" = "Windows" ] then ctest -V -C ${CMAKE_BUILD_TYPE} fi diff --git a/thirdParty/cupla/alpaka/script/set.sh b/thirdParty/cupla/alpaka/script/set.sh index 262b6a77e5..7f2172ec82 100755 --- a/thirdParty/cupla/alpaka/script/set.sh +++ b/thirdParty/cupla/alpaka/script/set.sh @@ -3,7 +3,7 @@ # # Copyright 2018-2019 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/script/travis_retry.sh b/thirdParty/cupla/alpaka/script/travis_retry.sh index d29ab93241..3bf25d5e8c 100755 --- a/thirdParty/cupla/alpaka/script/travis_retry.sh +++ b/thirdParty/cupla/alpaka/script/travis_retry.sh @@ -2,7 +2,7 @@ # # Copyright 2019 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/CMakeLists.txt b/thirdParty/cupla/alpaka/test/CMakeLists.txt index e55fc93c6f..1663731a7f 100644 --- a/thirdParty/cupla/alpaka/test/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2015-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/analysis/CMakeLists.txt b/thirdParty/cupla/alpaka/test/analysis/CMakeLists.txt index 10d61204a2..451db4de00 100644 --- a/thirdParty/cupla/alpaka/test/analysis/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/analysis/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Benjamin Worpitz, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/analysis/headerCheck/CMakeLists.txt b/thirdParty/cupla/alpaka/test/analysis/headerCheck/CMakeLists.txt index 87b162e111..641e9f7632 100644 --- a/thirdParty/cupla/alpaka/test/analysis/headerCheck/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/analysis/headerCheck/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/analysis/headerCheck/src/main.cpp b/thirdParty/cupla/alpaka/test/analysis/headerCheck/src/main.cpp index 11d6b7196f..bd7800d15d 100644 --- a/thirdParty/cupla/alpaka/test/analysis/headerCheck/src/main.cpp +++ b/thirdParty/cupla/alpaka/test/analysis/headerCheck/src/main.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/catch_main/CMakeLists.txt b/thirdParty/cupla/alpaka/test/catch_main/CMakeLists.txt index eba3065057..5b83466d64 100644 --- a/thirdParty/cupla/alpaka/test/catch_main/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/catch_main/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2015-2020 Benjamin Worpitz, Axel Huebl # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/catch_main/src/CatchMain.cpp b/thirdParty/cupla/alpaka/test/catch_main/src/CatchMain.cpp index 31734b3a0a..992e94f13d 100644 --- a/thirdParty/cupla/alpaka/test/catch_main/src/CatchMain.cpp +++ b/thirdParty/cupla/alpaka/test/catch_main/src/CatchMain.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/common/CMakeLists.txt b/thirdParty/cupla/alpaka/test/common/CMakeLists.txt index 42e1a77d9b..1f3cc74181 100644 --- a/thirdParty/cupla/alpaka/test/common/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/common/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Benjamin Worpitz, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this @@ -35,6 +35,7 @@ target_compile_options(${_COMMON_TARGET_NAME} if(MSVC) target_compile_options(${_COMMON_TARGET_NAME} PUBLIC "/wd4996") # This function or variable may be unsafe. Consider using instead. 
+ target_compile_options(${_COMMON_TARGET_NAME} PUBLIC "/bigobj") endif() if(ALPAKA_ACC_GPU_CUDA_ENABLE OR (ALPAKA_ACC_GPU_HIP_ENABLE AND HIP_PLATFORM MATCHES "nvcc")) diff --git a/thirdParty/cupla/alpaka/test/common/devCompileOptions.cmake b/thirdParty/cupla/alpaka/test/common/devCompileOptions.cmake index 5b472aa64c..4d42e6b90e 100644 --- a/thirdParty/cupla/alpaka/test/common/devCompileOptions.cmake +++ b/thirdParty/cupla/alpaka/test/common/devCompileOptions.cmake @@ -1,7 +1,7 @@ # # Copyright 2014-2019 Benjamin Worpitz # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this @@ -18,6 +18,10 @@ TARGET_INCLUDE_DIRECTORIES( SYSTEM INTERFACE ${Boost_INCLUDE_DIRS}) +IF(ALPAKA_ACC_GPU_CUDA_ENABLE AND (ALPAKA_CUDA_COMPILER MATCHES "nvcc") AND (ALPAKA_CUDA_VERSION VERSION_GREATER_EQUAL 11.0)) + LIST(APPEND CUDA_NVCC_FLAGS -Wdefault-stream-launch -Werror=default-stream-launch) +ENDIF() + #MSVC IF(MSVC) # Force to always compile with W4 and WX diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Array.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Array.hpp index e08fa9aaba..cdfc613a0a 100644 --- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Array.hpp +++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Array.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Check.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Check.hpp index c7acd759d5..6196405894 100644 --- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Check.hpp +++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Check.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Extent.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Extent.hpp index cc74121902..87de22a14c 100644 --- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Extent.hpp +++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Extent.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -15,50 +15,57 @@ namespace alpaka //! The test specifics. namespace test { - //############################################################################# - //! 1D: (5) - //! 2D: (5, 4) - //! 3D: (5, 4, 3) - //! 4D: (5, 4, 3, 2) - // We have to be careful with the extents used. - // When TIdx is a 8 bit signed integer and Dim is 4, the extent is extremely limited. template< - std::size_t Tidx> - struct CreateExtentBufVal + typename TIdx> + struct CreateVecWithIdx { - //----------------------------------------------------------------------------- - ALPAKA_NO_HOST_ACC_WARNING + //############################################################################# + //! 1D: (11) + //! 2D: (11, 10) + //! 3D: (11, 10, 9) + //! 
4D: (11, 10, 9, 8) template< - typename TIdx> - ALPAKA_FN_HOST_ACC - static auto create( - TIdx) - -> TIdx + std::size_t Tidx> + struct ForExtentBuf { - return static_cast(5u - Tidx); - } - }; + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST_ACC static auto create() + { + return static_cast(11u - Tidx); + } + }; - //############################################################################# - //! 1D: (4) - //! 2D: (4, 3) - //! 3D: (4, 3, 2) - //! 4D: (4, 3, 2, 1) - template< - std::size_t Tidx> - struct CreateExtentViewVal - { - //----------------------------------------------------------------------------- - ALPAKA_NO_HOST_ACC_WARNING + //############################################################################# + //! 1D: (8) + //! 2D: (8, 6) + //! 3D: (8, 6, 4) + //! 4D: (8, 6, 4, 2) + template< + std::size_t Tidx> + struct ForExtentSubView + { + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST_ACC static auto create() + { + return static_cast(8u - (Tidx * 2u)); + } + }; + + //############################################################################# + //! 1D: (2) + //! 2D: (2, 3) + //! 3D: (2, 3, 4) + //! 
4D: (2, 3, 4, 5) template< - typename TIdx> - ALPAKA_FN_HOST_ACC - static auto create( - TIdx) - -> TIdx + std::size_t Tidx> + struct ForOffset { - return static_cast(4u - Tidx); - } + //----------------------------------------------------------------------------- + ALPAKA_FN_HOST_ACC static auto create() + { + return static_cast(2u + Tidx); + } + }; }; } } diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/KernelExecutionFixture.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/KernelExecutionFixture.hpp index ea11697c0e..7dccb4f3aa 100644 --- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/KernelExecutionFixture.hpp +++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/KernelExecutionFixture.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -31,6 +31,7 @@ namespace alpaka using DevAcc = alpaka::dev::Dev; using PltfAcc = alpaka::pltf::Pltf; using QueueAcc = alpaka::test::queue::DefaultQueue; + using WorkDiv = alpaka::workdiv::WorkDivMembers; public: //----------------------------------------------------------------------------- @@ -50,6 +51,14 @@ namespace alpaka alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted)) {} //----------------------------------------------------------------------------- + KernelExecutionFixture( + WorkDiv const & workDiv) : + m_devHost(alpaka::pltf::getDevByIdx(0u)), + m_devAcc(alpaka::pltf::getDevByIdx(0u)), + m_queue(m_devAcc), + m_workDiv(workDiv) + {} + //----------------------------------------------------------------------------- template< typename TKernelFnObj, typename... 
TArgs> @@ -87,7 +96,7 @@ namespace alpaka alpaka::dev::DevCpu m_devHost; DevAcc m_devAcc; QueueAcc m_queue; - alpaka::workdiv::WorkDivMembers m_workDiv; + WorkDiv m_workDiv; }; } } diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/MeasureKernelRunTime.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/MeasureKernelRunTime.hpp index b65aec096d..8016702aa6 100644 --- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/MeasureKernelRunTime.hpp +++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/MeasureKernelRunTime.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/acc/TestAccs.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/acc/TestAccs.hpp index 07481635e6..08ab47f42a 100644 --- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/acc/TestAccs.hpp +++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/acc/TestAccs.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -244,24 +244,9 @@ namespace alpaka idx::TestIdxs >; - //############################################################################# - //! Transforms a std::tuple holding a dimension and a idx type to a fully instantiated accelerator. - //! - //! 
EnabledAccs = tuple, ..., AccN> - template< - typename TTestAccParamSet> - struct InstantiateEnabledAccsWithTestParamSetImpl - { - using type = - EnabledAccs< - std::tuple_element_t<0, TTestAccParamSet>, - std::tuple_element_t<1, TTestAccParamSet> - >; - }; - template< - typename TTestAccParamSet> - using InstantiateEnabledAccsWithTestParamSet = typename InstantiateEnabledAccsWithTestParamSetImpl::type; + typename TList> + using ApplyEnabledAccs = alpaka::meta::Apply; //############################################################################# //! A std::tuple containing std::tuple with fully instantiated accelerators. @@ -275,7 +260,7 @@ namespace alpaka using InstantiatedEnabledAccs = alpaka::meta::Transform< TestDimIdxTuples, - InstantiateEnabledAccsWithTestParamSet + ApplyEnabledAccs >; } diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/dim/TestDims.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/dim/TestDims.hpp index b84431d0f5..148f050bc9 100644 --- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/dim/TestDims.hpp +++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/dim/TestDims.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -13,16 +13,6 @@ #include -// When compiling the tests with CUDA enabled (nvcc or native clang) on the CI infrastructure -// we have to dramatically reduce the number of tested combinations. -// Else the log length would be exceeded. 
-#if defined(ALPAKA_CI) - #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA \ - || defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP - #define ALPAKA_CUDA_CI - #endif -#endif - namespace alpaka { namespace test @@ -34,9 +24,7 @@ namespace alpaka using TestDims = std::tuple< alpaka::dim::DimInt<1u> -#if !defined(ALPAKA_CUDA_CI) ,alpaka::dim::DimInt<2u> -#endif ,alpaka::dim::DimInt<3u> // The CUDA & HIP accelerators do not currently support 4D buffers and 4D acceleration. #if !(defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/event/EventHostManualTrigger.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/event/EventHostManualTrigger.hpp index e14d5ae1d9..b59f1603f0 100644 --- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/event/EventHostManualTrigger.hpp +++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/event/EventHostManualTrigger.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/idx/TestIdxs.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/idx/TestIdxs.hpp index 0d860ad2b4..981a45ab3d 100644 --- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/idx/TestIdxs.hpp +++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/idx/TestIdxs.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/mem/view/Iterator.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/mem/view/Iterator.hpp index 6d97097ce0..2cbdb7f210 100644 --- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/mem/view/Iterator.hpp +++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/mem/view/Iterator.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Erik Zenker * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/mem/view/ViewTest.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/mem/view/ViewTest.hpp index 46158e34ed..5fe819187f 100644 --- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/mem/view/ViewTest.hpp +++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/mem/view/ViewTest.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/Queue.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/Queue.hpp index db48dea0c9..6c7c75d468 100644 --- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/Queue.hpp +++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/Queue.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/QueueCpuOmp2Collective.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/QueueCpuOmp2Collective.hpp index 095984054e..6edaabfd68 100644 --- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/QueueCpuOmp2Collective.hpp +++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/QueueCpuOmp2Collective.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -33,14 +34,6 @@ #include #include -namespace alpaka -{ - namespace event - { - class EventCpu; - } -} - namespace alpaka { namespace queue @@ -117,7 +110,7 @@ namespace alpaka m_spQueueImpl(std::make_shared(dev)), m_spBlockingQueue(std::make_shared(dev)) { - dev.m_spDevCpuImpl->RegisterQueue(m_spQueueImpl); + dev.registerQueue(m_spQueueImpl); } //----------------------------------------------------------------------------- QueueCpuOmp2Collective(QueueCpuOmp2Collective const &) = default; diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/QueueTestFixture.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/QueueTestFixture.hpp index 8280f61005..225a76a990 100644 --- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/QueueTestFixture.hpp +++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/QueueTestFixture.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/common/src/Dummy.cpp b/thirdParty/cupla/alpaka/test/common/src/Dummy.cpp index dbe641fb23..e8e0a89082 100644 --- a/thirdParty/cupla/alpaka/test/common/src/Dummy.cpp +++ b/thirdParty/cupla/alpaka/test/common/src/Dummy.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/integ/CMakeLists.txt b/thirdParty/cupla/alpaka/test/integ/CMakeLists.txt index 6df80b52f9..7909c19ecb 100644 --- a/thirdParty/cupla/alpaka/test/integ/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/integ/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Benjamin Worpitz, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/integ/axpy/CMakeLists.txt b/thirdParty/cupla/alpaka/test/integ/axpy/CMakeLists.txt index 705611e74e..054adb6109 100644 --- a/thirdParty/cupla/alpaka/test/integ/axpy/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/integ/axpy/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/integ/axpy/src/axpy.cpp b/thirdParty/cupla/alpaka/test/integ/axpy/src/axpy.cpp index 8d18f8f87b..c8d24eccda 100644 --- a/thirdParty/cupla/alpaka/test/integ/axpy/src/axpy.cpp +++ b/thirdParty/cupla/alpaka/test/integ/axpy/src/axpy.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/integ/cudaOnly/CMakeLists.txt b/thirdParty/cupla/alpaka/test/integ/cudaOnly/CMakeLists.txt index 36ab673ef1..d517629346 100644 --- a/thirdParty/cupla/alpaka/test/integ/cudaOnly/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/integ/cudaOnly/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2016-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/integ/cudaOnly/src/cudaNativeFunctions.cpp b/thirdParty/cupla/alpaka/test/integ/cudaOnly/src/cudaNativeFunctions.cpp index 4b5fa2d9cf..e483004228 100644 --- a/thirdParty/cupla/alpaka/test/integ/cudaOnly/src/cudaNativeFunctions.cpp +++ b/thirdParty/cupla/alpaka/test/integ/cudaOnly/src/cudaNativeFunctions.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/integ/mandelbrot/CMakeLists.txt b/thirdParty/cupla/alpaka/test/integ/mandelbrot/CMakeLists.txt index 170462374d..58a49d4f0c 100644 --- a/thirdParty/cupla/alpaka/test/integ/mandelbrot/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/integ/mandelbrot/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/integ/mandelbrot/src/mandelbrot.cpp b/thirdParty/cupla/alpaka/test/integ/mandelbrot/src/mandelbrot.cpp index 4031357289..ea6d60834f 100644 --- a/thirdParty/cupla/alpaka/test/integ/mandelbrot/src/mandelbrot.cpp +++ b/thirdParty/cupla/alpaka/test/integ/mandelbrot/src/mandelbrot.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/integ/matMul/CMakeLists.txt b/thirdParty/cupla/alpaka/test/integ/matMul/CMakeLists.txt index fa17965876..109d87206f 100644 --- a/thirdParty/cupla/alpaka/test/integ/matMul/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/integ/matMul/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/integ/matMul/src/matMul.cpp b/thirdParty/cupla/alpaka/test/integ/matMul/src/matMul.cpp index cda1f0855a..3b3e32fb84 100644 --- a/thirdParty/cupla/alpaka/test/integ/matMul/src/matMul.cpp +++ b/thirdParty/cupla/alpaka/test/integ/matMul/src/matMul.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/integ/separableCompilation/CMakeLists.txt b/thirdParty/cupla/alpaka/test/integ/separableCompilation/CMakeLists.txt index 3845a4361d..26a411070f 100644 --- a/thirdParty/cupla/alpaka/test/integ/separableCompilation/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/integ/separableCompilation/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/integ/separableCompilation/include/mysqrt.hpp b/thirdParty/cupla/alpaka/test/integ/separableCompilation/include/mysqrt.hpp index cc9c83838e..0c32c3fdb9 100644 --- a/thirdParty/cupla/alpaka/test/integ/separableCompilation/include/mysqrt.hpp +++ b/thirdParty/cupla/alpaka/test/integ/separableCompilation/include/mysqrt.hpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/integ/separableCompilation/src/main.cpp b/thirdParty/cupla/alpaka/test/integ/separableCompilation/src/main.cpp index 317f4694b1..8726af651b 100644 --- a/thirdParty/cupla/alpaka/test/integ/separableCompilation/src/main.cpp +++ b/thirdParty/cupla/alpaka/test/integ/separableCompilation/src/main.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/integ/separableCompilation/src/mysqrt.cpp b/thirdParty/cupla/alpaka/test/integ/separableCompilation/src/mysqrt.cpp index a5ea90a625..d7e5c67a63 100644 --- a/thirdParty/cupla/alpaka/test/integ/separableCompilation/src/mysqrt.cpp +++ b/thirdParty/cupla/alpaka/test/integ/separableCompilation/src/mysqrt.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/integ/sharedMem/CMakeLists.txt b/thirdParty/cupla/alpaka/test/integ/sharedMem/CMakeLists.txt index d7a11dcb24..4a5d5eb7bb 100644 --- a/thirdParty/cupla/alpaka/test/integ/sharedMem/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/integ/sharedMem/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/integ/sharedMem/src/sharedMem.cpp b/thirdParty/cupla/alpaka/test/integ/sharedMem/src/sharedMem.cpp index e1e49b59a1..fd4f640150 100644 --- a/thirdParty/cupla/alpaka/test/integ/sharedMem/src/sharedMem.cpp +++ b/thirdParty/cupla/alpaka/test/integ/sharedMem/src/sharedMem.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/CMakeLists.txt index 69eabc3455..fbaceee4d2 100644 --- a/thirdParty/cupla/alpaka/test/unit/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/unit/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2015-2020 Benjamin Worpitz, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -23,11 +23,14 @@ add_subdirectory("atomic/") add_subdirectory("block/shared/") add_subdirectory("block/sync/") add_subdirectory("core/") +add_subdirectory("dev/") add_subdirectory("event/") add_subdirectory("idx/") +add_subdirectory("intrinsic/") add_subdirectory("kernel/") -add_subdirectory("math/sincos/") +add_subdirectory("math/") add_subdirectory("mem/buf/") +add_subdirectory("mem/copy/") add_subdirectory("mem/view/") add_subdirectory("mem/p2p/") add_subdirectory("meta/") @@ -35,3 +38,5 @@ add_subdirectory("queue/") add_subdirectory("rand/") add_subdirectory("time/") add_subdirectory("vec/") +add_subdirectory("warp/") +add_subdirectory("workDiv/") diff --git a/thirdParty/cupla/alpaka/test/unit/acc/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/acc/CMakeLists.txt index da9edb0d8a..1811ae105a 100644 --- a/thirdParty/cupla/alpaka/test/unit/acc/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/unit/acc/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/acc/src/AccDevPropsTest.cpp b/thirdParty/cupla/alpaka/test/unit/acc/src/AccDevPropsTest.cpp new file mode 100644 index 0000000000..10a0e7b274 --- /dev/null +++ b/thirdParty/cupla/alpaka/test/unit/acc/src/AccDevPropsTest.cpp @@ -0,0 +1,36 @@ +/* Copyright 2020 Sergei Bastrakov + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#include +#include + +#include + +#include + +//----------------------------------------------------------------------------- +TEMPLATE_LIST_TEST_CASE( "getAccDevProps", "[acc]", alpaka::test::acc::TestAccs) +{ + using Acc = TestType; + using Dev = alpaka::dev::Dev; + using Pltf = alpaka::pltf::Pltf; + Dev const dev(alpaka::pltf::getDevByIdx(0u)); + auto const devProps = alpaka::acc::getAccDevProps(dev); + + REQUIRE(devProps.m_gridBlockExtentMax.prod() > 0); + // Note: this causes signed overflow for some configurations, + // will be fixed separately + // REQUIRE(devProps.m_blockThreadExtentMax.prod() > 0); + REQUIRE(devProps.m_threadElemExtentMax.prod() > 0); + REQUIRE(devProps.m_gridBlockCountMax > 0); + REQUIRE(devProps.m_blockThreadCountMax > 0); + REQUIRE(devProps.m_threadElemCountMax > 0); + REQUIRE(devProps.m_multiProcessorCount > 0); + REQUIRE(devProps.m_sharedMemSizeBytes > 0); +} diff --git a/thirdParty/cupla/alpaka/test/unit/acc/src/AccNameTest.cpp b/thirdParty/cupla/alpaka/test/unit/acc/src/AccNameTest.cpp index 31e139f31c..90db216c41 100644 --- a/thirdParty/cupla/alpaka/test/unit/acc/src/AccNameTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/acc/src/AccNameTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/atomic/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/atomic/CMakeLists.txt index c22a9512eb..b866527ac2 100644 --- a/thirdParty/cupla/alpaka/test/unit/atomic/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/unit/atomic/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2016-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 
2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/atomic/src/AtomicTest.cpp b/thirdParty/cupla/alpaka/test/unit/atomic/src/AtomicTest.cpp index fc3c745272..92c00a36ef 100644 --- a/thirdParty/cupla/alpaka/test/unit/atomic/src/AtomicTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/atomic/src/AtomicTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -29,7 +29,7 @@ ALPAKA_FN_ACC auto testAtomicAdd( T operandOrig) -> void { - auto && operand = alpaka::block::shared::st::allocVar(acc); + auto & operand = alpaka::block::shared::st::allocVar(acc); operand = operandOrig; T const value = static_cast(4); T const ret = @@ -54,7 +54,7 @@ ALPAKA_FN_ACC auto testAtomicSub( T operandOrig) -> void { - auto && operand = alpaka::block::shared::st::allocVar(acc); + auto & operand = alpaka::block::shared::st::allocVar(acc); operand = operandOrig; T const value = static_cast(4); T const ret = @@ -79,7 +79,7 @@ ALPAKA_FN_ACC auto testAtomicMin( T operandOrig) -> void { - auto && operand = alpaka::block::shared::st::allocVar(acc); + auto & operand = alpaka::block::shared::st::allocVar(acc); operand = operandOrig; T const value = static_cast(4); T const ret = @@ -104,7 +104,7 @@ ALPAKA_FN_ACC auto testAtomicMax( T operandOrig) -> void { - auto && operand = alpaka::block::shared::st::allocVar(acc); + auto & operand = alpaka::block::shared::st::allocVar(acc); operand = operandOrig; T const value = static_cast(4); T const ret = @@ -129,7 +129,7 @@ ALPAKA_FN_ACC auto testAtomicExch( T operandOrig) -> void { - auto && operand = alpaka::block::shared::st::allocVar(acc); + auto & operand = alpaka::block::shared::st::allocVar(acc); operand = operandOrig; T const value = static_cast(4); T const ret = 
@@ -155,7 +155,7 @@ ALPAKA_FN_ACC auto testAtomicInc( -> void { // \TODO: Check reset to 0 at 'value'. - auto && operand = alpaka::block::shared::st::allocVar(acc); + auto & operand = alpaka::block::shared::st::allocVar(acc); operand = operandOrig; T const value = static_cast(42); T const ret = @@ -181,7 +181,7 @@ ALPAKA_FN_ACC auto testAtomicDec( -> void { // \TODO: Check reset to 'value' at 0. - auto && operand = alpaka::block::shared::st::allocVar(acc); + auto & operand = alpaka::block::shared::st::allocVar(acc); operand = operandOrig; T const value = static_cast(42); T const ret = @@ -206,7 +206,7 @@ ALPAKA_FN_ACC auto testAtomicAnd( T operandOrig) -> void { - auto && operand = alpaka::block::shared::st::allocVar(acc); + auto & operand = alpaka::block::shared::st::allocVar(acc); operand = operandOrig; T const value = static_cast(4); T const ret = @@ -231,7 +231,7 @@ ALPAKA_FN_ACC auto testAtomicOr( T operandOrig) -> void { - auto && operand = alpaka::block::shared::st::allocVar(acc); + auto & operand = alpaka::block::shared::st::allocVar(acc); operand = operandOrig; T const value = static_cast(4); T const ret = @@ -256,7 +256,7 @@ ALPAKA_FN_ACC auto testAtomicXor( T operandOrig) -> void { - auto && operand = alpaka::block::shared::st::allocVar(acc); + auto & operand = alpaka::block::shared::st::allocVar(acc); operand = operandOrig; T const value = operandOrig + static_cast(4); T const ret = @@ -281,7 +281,7 @@ ALPAKA_FN_ACC auto testAtomicCas( T operandOrig) -> void { - auto && operand = alpaka::block::shared::st::allocVar(acc); + auto & operand = alpaka::block::shared::st::allocVar(acc); //----------------------------------------------------------------------------- // with match diff --git a/thirdParty/cupla/alpaka/test/unit/block/shared/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/block/shared/CMakeLists.txt index cbb9ae2b2d..192f287ab2 100644 --- a/thirdParty/cupla/alpaka/test/unit/block/shared/CMakeLists.txt +++ 
b/thirdParty/cupla/alpaka/test/unit/block/shared/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/block/shared/src/BlockSharedMemDyn.cpp b/thirdParty/cupla/alpaka/test/unit/block/shared/src/BlockSharedMemDyn.cpp index 8b7a89c1fc..34f176b7ff 100644 --- a/thirdParty/cupla/alpaka/test/unit/block/shared/src/BlockSharedMemDyn.cpp +++ b/thirdParty/cupla/alpaka/test/unit/block/shared/src/BlockSharedMemDyn.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -29,15 +29,15 @@ class BlockSharedMemDynTestKernel -> void { // Assure that the pointer is non null. - auto && a = alpaka::block::shared::dyn::getMem(acc); + auto a = alpaka::block::shared::dyn::getMem(acc); ALPAKA_CHECK(*success, static_cast(nullptr) != a); // Each call should return the same pointer ... - auto && b = alpaka::block::shared::dyn::getMem(acc); + auto b = alpaka::block::shared::dyn::getMem(acc); ALPAKA_CHECK(*success, a == b); // ... even for different types. 
- auto && c = alpaka::block::shared::dyn::getMem(acc); + auto c = alpaka::block::shared::dyn::getMem(acc); ALPAKA_CHECK(*success, a == reinterpret_cast(c)); } }; diff --git a/thirdParty/cupla/alpaka/test/unit/block/shared/src/BlockSharedMemSt.cpp b/thirdParty/cupla/alpaka/test/unit/block/shared/src/BlockSharedMemSt.cpp index 28f2625098..82727cf6d5 100644 --- a/thirdParty/cupla/alpaka/test/unit/block/shared/src/BlockSharedMemSt.cpp +++ b/thirdParty/cupla/alpaka/test/unit/block/shared/src/BlockSharedMemSt.cpp @@ -36,29 +36,29 @@ class BlockSharedMemStNonNullTestKernel // Multiple runs to make sure it really works. for(std::size_t i=0u; i<10; ++i) { - auto && a = alpaka::block::shared::st::allocVar(acc); + auto & a = alpaka::block::shared::st::allocVar(acc); ALPAKA_CHECK(*success, static_cast(nullptr) != &a); - auto && b = alpaka::block::shared::st::allocVar(acc); + auto & b = alpaka::block::shared::st::allocVar(acc); ALPAKA_CHECK(*success, static_cast(nullptr) != &b); - auto && c = alpaka::block::shared::st::allocVar(acc); + auto & c = alpaka::block::shared::st::allocVar(acc); ALPAKA_CHECK(*success, static_cast(nullptr) != &c); - auto && d = alpaka::block::shared::st::allocVar(acc); + auto & d = alpaka::block::shared::st::allocVar(acc); ALPAKA_CHECK(*success, static_cast(nullptr) != &d); - auto && e = alpaka::block::shared::st::allocVar(acc); + auto & e = alpaka::block::shared::st::allocVar(acc); ALPAKA_CHECK(*success, static_cast(nullptr) != &e); - auto && f = alpaka::block::shared::st::allocVar, __COUNTER__>(acc); + auto & f = alpaka::block::shared::st::allocVar, __COUNTER__>(acc); ALPAKA_CHECK(*success, static_cast(nullptr) != &f[0]); - auto && g = alpaka::block::shared::st::allocVar, __COUNTER__>(acc); + auto & g = alpaka::block::shared::st::allocVar, __COUNTER__>(acc); ALPAKA_CHECK(*success, static_cast(nullptr) != &g[0]); - auto && h = alpaka::block::shared::st::allocVar, __COUNTER__>(acc); + auto & h = alpaka::block::shared::st::allocVar, __COUNTER__>(acc); 
ALPAKA_CHECK(*success, static_cast(nullptr) != &h[0]); } #if BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(6, 0, 0) @@ -99,19 +99,19 @@ class BlockSharedMemStSameTypeDifferentAdressTestKernel // Multiple runs to make sure it really works. for(std::size_t i=0u; i<10; ++i) { - auto && a = alpaka::block::shared::st::allocVar(acc); - auto && b = alpaka::block::shared::st::allocVar(acc); + auto & a = alpaka::block::shared::st::allocVar(acc); + auto & b = alpaka::block::shared::st::allocVar(acc); ALPAKA_CHECK(*success, &a != &b); - auto && c = alpaka::block::shared::st::allocVar(acc); + auto & c = alpaka::block::shared::st::allocVar(acc); ALPAKA_CHECK(*success, &b != &c); ALPAKA_CHECK(*success, &a != &c); ALPAKA_CHECK(*success, &b != &c); - auto && d = alpaka::block::shared::st::allocVar, __COUNTER__>(acc); + auto & d = alpaka::block::shared::st::allocVar, __COUNTER__>(acc); ALPAKA_CHECK(*success, &a != &d[0]); ALPAKA_CHECK(*success, &b != &d[0]); ALPAKA_CHECK(*success, &c != &d[0]); - auto && e = alpaka::block::shared::st::allocVar, __COUNTER__>(acc); + auto & e = alpaka::block::shared::st::allocVar, __COUNTER__>(acc); ALPAKA_CHECK(*success, &a != &e[0]); ALPAKA_CHECK(*success, &b != &e[0]); ALPAKA_CHECK(*success, &c != &e[0]); diff --git a/thirdParty/cupla/alpaka/test/unit/block/sync/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/block/sync/CMakeLists.txt index 74c3dc152d..50c2348a62 100644 --- a/thirdParty/cupla/alpaka/test/unit/block/sync/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/unit/block/sync/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2017-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/block/sync/src/BlockSync.cpp b/thirdParty/cupla/alpaka/test/unit/block/sync/src/BlockSync.cpp index a5b6d90888..dae95d9e57 100644 --- a/thirdParty/cupla/alpaka/test/unit/block/sync/src/BlockSync.cpp +++ b/thirdParty/cupla/alpaka/test/unit/block/sync/src/BlockSync.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/block/sync/src/BlockSyncPredicate.cpp b/thirdParty/cupla/alpaka/test/unit/block/sync/src/BlockSyncPredicate.cpp index 62c17299e2..6278f3543a 100644 --- a/thirdParty/cupla/alpaka/test/unit/block/sync/src/BlockSyncPredicate.cpp +++ b/thirdParty/cupla/alpaka/test/unit/block/sync/src/BlockSyncPredicate.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/core/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/core/CMakeLists.txt index 1f8d31a507..89e9d9742a 100644 --- a/thirdParty/cupla/alpaka/test/unit/core/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/unit/core/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2018-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/core/src/BoostPredefTest.cpp b/thirdParty/cupla/alpaka/test/unit/core/src/BoostPredefTest.cpp index a7d188ceac..debdff1ad9 100644 --- a/thirdParty/cupla/alpaka/test/unit/core/src/BoostPredefTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/core/src/BoostPredefTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -43,6 +43,9 @@ TEST_CASE("printDefines", "[core]") #if BOOST_COMP_MSVC std::cout << "BOOST_COMP_MSVC:" << BOOST_COMP_MSVC << std::endl; #endif +#if defined(BOOST_COMP_MSVC_EMULATED) + std::cout << "BOOST_COMP_MSVC_EMULATED:" << BOOST_COMP_MSVC_EMULATED << std::endl; +#endif #if BOOST_COMP_CLANG_CUDA std::cout << "BOOST_COMP_CLANG_CUDA:" << BOOST_COMP_CLANG_CUDA << std::endl; #endif diff --git a/thirdParty/cupla/alpaka/test/unit/core/src/ClipCastTest.cpp b/thirdParty/cupla/alpaka/test/unit/core/src/ClipCastTest.cpp index 19ec2792a6..29f270cca5 100644 --- a/thirdParty/cupla/alpaka/test/unit/core/src/ClipCastTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/core/src/ClipCastTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/core/src/ConceptsTest.cpp b/thirdParty/cupla/alpaka/test/unit/core/src/ConceptsTest.cpp index c4c4873657..f738aebb21 100644 --- a/thirdParty/cupla/alpaka/test/unit/core/src/ConceptsTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/core/src/ConceptsTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/math/sincos/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/dev/CMakeLists.txt similarity index 84% rename from thirdParty/cupla/alpaka/test/unit/math/sincos/CMakeLists.txt rename to thirdParty/cupla/alpaka/test/unit/dev/CMakeLists.txt index f225679a59..55d8831066 100644 --- a/thirdParty/cupla/alpaka/test/unit/math/sincos/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/unit/dev/CMakeLists.txt @@ -1,5 +1,5 @@ # -# Copyright 2017-2020 Benjamin Worpitz, Axel Huebl, Matthias Werner, Jan Stephan +# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # # This file is part of Alpaka. # @@ -8,7 +8,7 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. # -set(_TARGET_NAME "sincos") +set(_TARGET_NAME "dev") append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE) diff --git a/thirdParty/cupla/alpaka/test/unit/dev/src/DevWarpSizeTest.cpp b/thirdParty/cupla/alpaka/test/unit/dev/src/DevWarpSizeTest.cpp new file mode 100644 index 0000000000..a2ae087f2d --- /dev/null +++ b/thirdParty/cupla/alpaka/test/unit/dev/src/DevWarpSizeTest.cpp @@ -0,0 +1,24 @@ +/* Copyright 2020 Sergei Bastrakov + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include + +#include + +#include + +//----------------------------------------------------------------------------- +TEMPLATE_LIST_TEST_CASE( "getWarpSize", "[dev]", alpaka::test::acc::TestAccs) +{ + using Dev = alpaka::dev::Dev; + using Pltf = alpaka::pltf::Pltf; + Dev const dev(alpaka::pltf::getDevByIdx(0u)); + auto const warpExtent = alpaka::dev::getWarpSize(dev); + REQUIRE(warpExtent > 0); +} diff --git a/thirdParty/cupla/alpaka/test/unit/event/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/event/CMakeLists.txt index fbb14f451f..055a388e68 100644 --- a/thirdParty/cupla/alpaka/test/unit/event/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/unit/event/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2017-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/event/src/EventTest.cpp b/thirdParty/cupla/alpaka/test/unit/event/src/EventTest.cpp index af6b70c985..f84a1c35c3 100644 --- a/thirdParty/cupla/alpaka/test/unit/event/src/EventTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/event/src/EventTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/idx/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/idx/CMakeLists.txt index d8a12ccf98..f1df3296dd 100644 --- a/thirdParty/cupla/alpaka/test/unit/idx/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/unit/idx/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2017-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/idx/src/MapIdx.cpp b/thirdParty/cupla/alpaka/test/unit/idx/src/MapIdx.cpp index d6de1145ed..e4500f9ab6 100644 --- a/thirdParty/cupla/alpaka/test/unit/idx/src/MapIdx.cpp +++ b/thirdParty/cupla/alpaka/test/unit/idx/src/MapIdx.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -12,29 +12,10 @@ #include #include +#include #include -//############################################################################# -//! 1D: (17) -//! 2D: (17, 14) -//! 3D: (17, 14, 11) -//! 
4D: (17, 14, 11, 8) -template< - std::size_t Tidx> -struct CreateExtentVal -{ - //----------------------------------------------------------------------------- - template< - typename TIdx> - ALPAKA_FN_HOST_ACC static auto create( - TIdx) - -> TIdx - { - return static_cast(17u - (Tidx*3u)); - } -}; - //----------------------------------------------------------------------------- TEMPLATE_LIST_TEST_CASE( "mapIdx", "[idx]", alpaka::test::dim::TestDims) { @@ -42,7 +23,7 @@ TEMPLATE_LIST_TEST_CASE( "mapIdx", "[idx]", alpaka::test::dim::TestDims) using Idx = std::size_t; using Vec = alpaka::vec::Vec; - auto const extentNd(alpaka::vec::createVecFromIndexedFnWorkaround(Idx())); + auto const extentNd(alpaka::vec::createVecFromIndexedFn::template ForExtentBuf>()); auto const idxNd(extentNd - Vec::all(4u)); auto const idx1d(alpaka::idx::mapIdx<1u>(idxNd, extentNd)); diff --git a/thirdParty/cupla/alpaka/test/unit/intrinsic/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/intrinsic/CMakeLists.txt new file mode 100644 index 0000000000..335bc65973 --- /dev/null +++ b/thirdParty/cupla/alpaka/test/unit/intrinsic/CMakeLists.txt @@ -0,0 +1,24 @@ +# +# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan +# +# This file is part of alpaka. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+# + +set(_TARGET_NAME "intrinsic") + +append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE) + +alpaka_add_executable( + ${_TARGET_NAME} + ${_FILES_SOURCE}) +target_link_libraries( + ${_TARGET_NAME} + PRIVATE common) + +set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit") + +add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS}) diff --git a/thirdParty/cupla/alpaka/test/unit/intrinsic/src/Ffs.cpp b/thirdParty/cupla/alpaka/test/unit/intrinsic/src/Ffs.cpp new file mode 100644 index 0000000000..189c0e8746 --- /dev/null +++ b/thirdParty/cupla/alpaka/test/unit/intrinsic/src/Ffs.cpp @@ -0,0 +1,81 @@ +/* Copyright 2020 Sergei Bastrakov + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include + +#include +#include +#include + +#include + +#include +#include + +//############################################################################# +template< + typename TInput> +class FfsTestKernel +{ +public: + //----------------------------------------------------------------------------- + ALPAKA_NO_HOST_ACC_WARNING + template< + typename TAcc> + ALPAKA_FN_ACC auto operator()( + TAcc const & acc, + bool * success) const + -> void + { + TInput const inputs[] = {0, 1, 3, 64, 256, 51362, + std::numeric_limits::max(), + -1, -32, -1352, -4096, std::numeric_limits::min()}; + for( auto const input : inputs ) + { + std::int32_t const expected = ffsNaive(input); + std::int32_t const actual = alpaka::intrinsic::ffs(acc, input); + ALPAKA_CHECK(*success, actual == expected); + } + } + +private: + ALPAKA_FN_ACC static auto ffsNaive(TInput value) -> std::int32_t + { + if (value == 0) + return 0; + std::int32_t result = 1; + while ((value & 1) == 0) + { + value >>= 1; + result++; + } + return result; + } +}; + 
+//----------------------------------------------------------------------------- +TEMPLATE_LIST_TEST_CASE( "ffs", "[intrinsic]", alpaka::test::acc::TestAccs) +{ + using Acc = TestType; + using Dim = alpaka::dim::Dim; + using Idx = alpaka::idx::Idx; + + alpaka::test::KernelExecutionFixture fixture( + alpaka::vec::Vec::ones()); + + FfsTestKernel kernel32bit; + REQUIRE( + fixture( + kernel32bit)); + + FfsTestKernel kernel64bit; + REQUIRE( + fixture( + kernel64bit)); +} diff --git a/thirdParty/cupla/alpaka/test/unit/intrinsic/src/Popcount.cpp b/thirdParty/cupla/alpaka/test/unit/intrinsic/src/Popcount.cpp new file mode 100644 index 0000000000..81928c4699 --- /dev/null +++ b/thirdParty/cupla/alpaka/test/unit/intrinsic/src/Popcount.cpp @@ -0,0 +1,77 @@ +/* Copyright 2020 Sergei Bastrakov + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#include + +#include +#include +#include + +#include + +//############################################################################# +template< + typename TInput> +class PopcountTestKernel +{ +public: + //----------------------------------------------------------------------------- + ALPAKA_NO_HOST_ACC_WARNING + template< + typename TAcc> + ALPAKA_FN_ACC auto operator()( + TAcc const & acc, + bool * success) const + -> void + { + // Use negative values to get inputs near the max value of TInput type + TInput const inputs[] = {0u, 1u, 3u, 54u, 163u, 51362u, + static_cast(-43631), static_cast(-1352), + static_cast(-642), static_cast(-1)}; + for( auto const input : inputs ) + { + int const expected = popcountNaive(input); + int const actual = alpaka::intrinsic::popcount(acc, input); + ALPAKA_CHECK(*success, actual == expected); + } + } + +private: + ALPAKA_FN_ACC static auto popcountNaive(TInput value) -> int + { + int result = 0; + while (value) + { + result += static_cast(value & 1u); + value >>= 1u; + } + return result; + } +}; + +//----------------------------------------------------------------------------- +TEMPLATE_LIST_TEST_CASE( "popcount", "[intrinsic]", alpaka::test::acc::TestAccs) +{ + using Acc = TestType; + using Dim = alpaka::dim::Dim; + using Idx = alpaka::idx::Idx; + + alpaka::test::KernelExecutionFixture fixture( + alpaka::vec::Vec::ones()); + + PopcountTestKernel kernel32bit; + REQUIRE( + fixture( + kernel32bit)); + + PopcountTestKernel kernel64bit; + REQUIRE( + fixture( + kernel64bit)); +} diff --git a/thirdParty/cupla/alpaka/test/unit/kernel/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/kernel/CMakeLists.txt index 2d16da0103..734c292bb3 100644 --- a/thirdParty/cupla/alpaka/test/unit/kernel/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/unit/kernel/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. 
# # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelGenericLambda.cpp b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelGenericLambda.cpp index fd455095b3..ea17df39a2 100644 --- a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelGenericLambda.cpp +++ b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelGenericLambda.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelLambda.cpp b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelLambda.cpp index 07c4da01cb..bacbd6e1d8 100644 --- a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelLambda.cpp +++ b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelLambda.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelStdFunction.cpp b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelStdFunction.cpp index 0856ea3cd8..325c8e4283 100644 --- a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelStdFunction.cpp +++ b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelStdFunction.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithAdditionalParam.cpp b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithAdditionalParam.cpp index 8436adebea..3d44a559aa 100644 --- a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithAdditionalParam.cpp +++ b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithAdditionalParam.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithConstructorAndMember.cpp b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithConstructorAndMember.cpp index 1e6d347eb9..a080cb39ce 100644 --- a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithConstructorAndMember.cpp +++ b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithConstructorAndMember.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithHostConstexpr.cpp b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithHostConstexpr.cpp index 3759dbc095..89b78ed7a6 100644 --- a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithHostConstexpr.cpp +++ b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithHostConstexpr.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this @@ -31,7 +31,7 @@ class KernelWithHostConstexpr { alpaka::ignore_unused(acc); -#if BOOST_COMP_MSVC +#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED) #pragma warning(push) #pragma warning(disable: 4127) // warning C4127: conditional expression is constant #endif @@ -39,7 +39,7 @@ class KernelWithHostConstexpr constexpr auto max = std::numeric_limits< std::uint32_t >::max(); ALPAKA_CHECK(*success, 0 != max); -#if BOOST_COMP_MSVC +#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED) #pragma warning(pop) #endif } diff --git a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithTemplate.cpp b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithTemplate.cpp index 7a78cf74bf..c746e218d0 100644 --- a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithTemplate.cpp +++ b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithTemplate.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithTemplateArgumentDeduction.cpp b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithTemplateArgumentDeduction.cpp index 4873e5763d..bc0b3eb4ba 100644 --- a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithTemplateArgumentDeduction.cpp +++ b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithTemplateArgumentDeduction.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera, Sergei Bastrakov * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithoutTemplatedAccParam.cpp b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithoutTemplatedAccParam.cpp index a855584e86..4e6f931d40 100644 --- a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithoutTemplatedAccParam.cpp +++ b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithoutTemplatedAccParam.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/math/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/math/CMakeLists.txt new file mode 100644 index 0000000000..e3b99d6ce4 --- /dev/null +++ b/thirdParty/cupla/alpaka/test/unit/math/CMakeLists.txt @@ -0,0 +1,45 @@ +# +# Copyright 2017-2019 Benjamin Worpitz, Jakob Krude +# +# This file is part of alpaka. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+# + +set(_TARGET_NAME "math") + +append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE) +append_recursive_files_add_to_src_group("src/" "src/" "hpp" _FILES_HEADER) + +if(ALPAKA_ACC_GPU_CUDA_ENABLE) + list(REMOVE_ITEM + CUDA_NVCC_FLAGS "--ftz=true" "--prec-div=false" "--prec-sqrt=false" "--fmad=true" "--use_fast_math" "-use_fast_math") + +endif() +if(ALPAKA_ACC_GPU_HIP_ENABLE) + list(REMOVE_ITEM + HIP_NVCC_FLAGS "--ftz=true" "--prec-div=false" "--prec-sqrt=false" "--fmad=true" "--use_fast_math" "-use_fast_math") +endif() + +alpaka_add_executable( + ${_TARGET_NAME} + ${_FILES_SOURCE} + ${_FILES_HEADER}) +target_include_directories( + ${_TARGET_NAME} + PRIVATE ${Boost_INCLUDE_DIRS}) +target_link_libraries( + ${_TARGET_NAME} + PRIVATE common) +set_target_properties( + ${_TARGET_NAME} + PROPERTIES + COMPILE_OPTIONS + $<$:"-ffp-contract=off"> # ffp-contract: https://llvm.org/docs/CompileCudaWithLLVM.html#id5 + ) + +set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit") + +add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS}) diff --git a/thirdParty/cupla/alpaka/test/unit/math/src/Buffer.hpp b/thirdParty/cupla/alpaka/test/unit/math/src/Buffer.hpp new file mode 100644 index 0000000000..4a7b264a91 --- /dev/null +++ b/thirdParty/cupla/alpaka/test/unit/math/src/Buffer.hpp @@ -0,0 +1,153 @@ +/** Copyright 2019 Jakob Krude, Benjamin Worpitz + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include "Defines.hpp" + +#include + +#include + +namespace alpaka { +namespace test { +namespace unit { +namespace math { + +//! Provides alpaka-style buffer with arguments' data. +//! TData can be a plain value or a complex data-structure. +//! 
The operator() is overloaded and returns the value from the correct Buffer, +//! either from the host (index) or device buffer (index, acc). +//! Index out of range errors are not checked. +//! @brief Encapsulates buffer initialisation and communication with Device. +//! @tparam TAcc Used accelerator, not interchangeable +//! @tparam TData The Data-type, only restricted by the alpaka-interface. +//! @tparam Tcapacity The size of the buffer. +template< + typename TAcc, + typename TData, + size_t Tcapacity +> +struct Buffer +{ + using value_type = TData; + static constexpr size_t capacity = Tcapacity; + using Dim = typename alpaka::dim::traits::DimType::type; + using Idx = typename alpaka::idx::traits::IdxType::type; + + // Defines using's for alpaka-buffer. + using DevAcc = alpaka::dev::Dev< TAcc >; + using DevHost = alpaka::dev::DevCpu; + using PltfHost = alpaka::pltf::Pltf< DevHost >; + + using BufHost = alpaka::mem::buf::Buf< + DevHost, + TData, + Dim, + Idx + >; + using BufAcc = alpaka::mem::buf::Buf< + DevAcc, + TData, + Dim, + Idx + >; + + DevHost devHost; + + BufHost hostBuffer; + BufAcc devBuffer; + + // Native pointer to access buffer. + TData * const pHostBuffer; + TData * const pDevBuffer; + + + // This constructor cant be used, + // because BufHost and BufAcc need to be initialised. + Buffer( ) = delete; + + // Constructor needs to initialize all Buffer. + Buffer(const DevAcc & devAcc) + : + devHost{ alpaka::pltf::getDevByIdx< PltfHost >( 0u ) }, + hostBuffer + { + alpaka::mem::buf::alloc(devHost, Tcapacity) + }, + devBuffer + { + alpaka::mem::buf::alloc(devAcc, Tcapacity) + }, + pHostBuffer{ alpaka::mem::view::getPtrNative( hostBuffer ) }, + pDevBuffer{ alpaka::mem::view::getPtrNative( devBuffer ) } + {} + + // Copy Host -> Acc. + template< typename Queue > + auto copyToDevice( Queue queue ) -> void + { + alpaka::mem::view::copy( + queue, + devBuffer, + hostBuffer, + Tcapacity + ); + } + + // Copy Acc -> Host. 
+ template< typename Queue > + auto copyFromDevice( Queue queue ) -> void + { + alpaka::mem::view::copy( + queue, + hostBuffer, + devBuffer, + Tcapacity + ); + } + + ALPAKA_FN_ACC + auto operator()( + size_t idx, + TAcc const & acc ) const -> TData& + { + alpaka::ignore_unused(acc); + return pDevBuffer[idx]; + } + + ALPAKA_FN_HOST + auto operator()( + size_t idx ) const -> TData& + { + return pHostBuffer[idx]; + } + + ALPAKA_FN_HOST + friend std::ostream & operator<<( + std::ostream & os, + const Buffer & buffer + ) + { + os << "capacity: " << capacity + << "\n"; + for( size_t i = 0; i < capacity; ++i ) + { + os << i + << ": " << buffer.pHostBuffer[i] + << "\n"; + } + return os; + } +}; + +} // math +} // unit +} // test +} // alpaka diff --git a/thirdParty/cupla/alpaka/test/unit/math/src/DataGen.hpp b/thirdParty/cupla/alpaka/test/unit/math/src/DataGen.hpp new file mode 100644 index 0000000000..098ba7114d --- /dev/null +++ b/thirdParty/cupla/alpaka/test/unit/math/src/DataGen.hpp @@ -0,0 +1,140 @@ +/** Copyright 2019 Jakob Krude, Benjamin Worpitz + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include "Defines.hpp" + +#include +#include +#include + +namespace alpaka { +namespace test { +namespace unit { +namespace math { + + + /** + * Fills buffer with random numbers (host-only). + * + * @tparam TData The used data-type (float || double). + * @tparam TArgs The args-buffer to be filled. + * @tparam TFunctor The used Functor-type. + * @param args The buffer that should be filled. + * @param functor The Functor, needed for ranges. + * @param seed The used seed. 
+ */ + template< + typename TData, + typename TArgs, + typename TFunctor> + auto fillWithRndArgs( + TArgs & args, + TFunctor functor, + unsigned int const & seed + ) -> void + { + /* + * Each "sub-buffer" is filled with zero and/or max and/or lowest, + * depending on the specified range (at [0] - [2]). + * + * Every switch case needs to return! + * If no switch case was matched an assert(false) will be triggered. + * + * This function is easily extendable. It is only necessary to add extra + * definitions in the switch case, for more Range-types. + */ + static_assert( TArgs::value_type::arity == TFunctor::arity, + "Buffer properties must match TFunctor::arity" ); + static_assert( TArgs::capacity > 2, + "Set of args must provide > 2 entries." ); + constexpr auto max = std::numeric_limits< TData >::max(); + constexpr auto low = std::numeric_limits< TData >::lowest(); + std::default_random_engine eng{ + static_cast< std::default_random_engine::result_type >( seed ) }; + + // These pseudo-random numbers are implementation/platform specific! 
+ std::uniform_real_distribution< TData > dist( 0, 1000 ); + std::uniform_real_distribution< TData > distOne( -1, 1 ); + for( size_t k = 0; k < TFunctor::arity_nr; ++k ) + { + bool matchedSwitch = false; + switch( functor.ranges[k] ) + { + case Range::OneNeighbourhood: + matchedSwitch = true; + for( size_t i = 0; i < TArgs::capacity; ++i ) + { + args( i ).arg[k] = distOne( eng ); + } + break; + + case Range::PositiveOnly: + matchedSwitch = true; + args( 0 ).arg[k] = max; + for( size_t i = 1; i < TArgs::capacity; ++i ) + { + args( i ).arg[k] = dist( eng ) + static_cast(1); + } + break; + + case Range::PositiveAndZero: + matchedSwitch = true; + args( 0 ).arg[k] = 0.0; + args( 1 ).arg[k] = max; + for( size_t i = 2; i < TArgs::capacity; ++i ) + { + args( i ).arg[k] = dist( eng ); + } + break; + + case Range::NotZero: + matchedSwitch = true; + args( 0 ).arg[k] = max; + args( 1 ).arg[k] = low; + for( size_t i = 2; i < TArgs::capacity; ++i ) + { + TData arg; + do + { + arg = dist( eng ); + } + while( std::equal_to()(arg,1) ); + if( i % 2 == 0 ) + args( i ).arg[k] = arg; + else + args( i ).arg[k] = -arg; + } + break; + + case Range::Unrestricted: + matchedSwitch = true; + args( 0 ).arg[k] = 0.0; + args( 1 ).arg[k] = max; + args( 2 ).arg[k] = low; + for( size_t i = 3; i < TArgs::capacity; ++i ) + { + if( i % 2 == 0 ) + args( i ).arg[k] = dist( eng ); + else + args( i ).arg[k] = -dist( eng ); + } + break; + } + // disable gcc-warning "unused variable" + alpaka::ignore_unused(matchedSwitch); + assert(matchedSwitch); + } + } + +} // math +} // unit +} // test +} // alpaka diff --git a/thirdParty/cupla/alpaka/test/unit/math/src/Defines.hpp b/thirdParty/cupla/alpaka/test/unit/math/src/Defines.hpp new file mode 100644 index 0000000000..8109519036 --- /dev/null +++ b/thirdParty/cupla/alpaka/test/unit/math/src/Defines.hpp @@ -0,0 +1,71 @@ +/** Copyright 2019 Jakob Krude, Benjamin Worpitz + * + * This file is part of alpaka. 
+ * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include +#include +#include +#include + +namespace alpaka { +namespace test { +namespace unit { +namespace math { + + // New types need to be added to the switch-case in DataGen.hpp + enum class Range + { + OneNeighbourhood, + PositiveOnly, + PositiveAndZero, + NotZero, + Unrestricted + }; + + // New types need to be added to the operator() function in Functor.hpp + enum class Arity + { + Unary = 1, + Binary = 2 + }; + + template + struct ArgsItem{ + static constexpr Arity arity = Tarity; + static constexpr size_t arity_nr = static_cast(Tarity); + + T arg[arity_nr]; // represents arg0, arg1, ... + + friend std::ostream & operator<<( + std::ostream & os, + const ArgsItem & argsItem + ) + { + os.precision(17); + os << "[ "; + for( size_t i = 0; i < argsItem.arity_nr; ++i ) + os << std::setprecision( + std::numeric_limits::digits10 + 1) << + argsItem.arg[i] << ", "; + os << "]"; + return os; + } + }; + + template< typename T > + auto rsqrt( T const & arg ) -> decltype( std::sqrt( arg ) ) + { + return static_cast(1) / std::sqrt( arg ); + } + +} // math +} // unit +} // test +} // alpaka diff --git a/thirdParty/cupla/alpaka/test/unit/math/src/Functor.hpp b/thirdParty/cupla/alpaka/test/unit/math/src/Functor.hpp new file mode 100644 index 0000000000..96fbc28095 --- /dev/null +++ b/thirdParty/cupla/alpaka/test/unit/math/src/Functor.hpp @@ -0,0 +1,286 @@ +/** Copyright 2019 Jakob Krude, Benjamin Worpitz + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#pragma once + +#include "Defines.hpp" + +#include + +#include + +namespace alpaka { +namespace test { +namespace unit { +namespace math { + + +// Can be used with operator() that will use either the std. function or the +// equivalent alpaka function (if an accelerator is passed additionally). +//! @param NAME The Name used for the Functor, e.g. OpAbs +//! @param ARITY Enum-type can be one ... n +//! @param STD_OP Function used for the host side, e.g. std::abs +//! @param ALPAKA_OP Function used for the device side, e.g. alpaka::math::abs. +//! @param ... List of Ranges. Needs to match the arity. +#define ALPAKA_TEST_MATH_OP_FUNCTOR( NAME, ARITY, STD_OP, ALPAKA_OP, ... ) \ + struct NAME \ + { \ + /* ranges is not a constexpr, so that it's accessible via for loop*/ \ + static constexpr Arity arity = ARITY; \ + static constexpr size_t arity_nr = static_cast(ARITY); \ + const Range ranges[ arity_nr ] = {__VA_ARGS__}; \ + \ + ALPAKA_NO_HOST_ACC_WARNING \ + template::value, \ + int>::type = 0> \ + ALPAKA_FN_ACC \ + auto execute( \ + TAcc const & acc, \ + TArgs const & ... args ) const \ + { \ + return ALPAKA_OP(acc, args... ); \ + } \ + \ + ALPAKA_NO_HOST_ACC_WARNING \ + template< \ + typename TAcc = std::nullptr_t, \ + typename... TArgs, \ + /* SFINAE: Enables if called from host. */ \ + typename std::enable_if< \ + std::is_same< TAcc, std::nullptr_t>::value, \ + int>::type = 0> \ + ALPAKA_FN_HOST \ + auto execute( \ + TAcc const & acc, \ + TArgs const &... args ) const \ + { \ + alpaka::ignore_unused( acc ); \ + return STD_OP( args... 
); \ + } \ + \ + /* assigns args by arity */ \ + ALPAKA_NO_HOST_ACC_WARNING \ + template< \ + typename T, \ + typename TAcc = std::nullptr_t> \ + ALPAKA_FN_HOST_ACC \ + auto operator()( \ + ArgsItem const & args, \ + TAcc const & acc = nullptr) const \ + { \ + return execute(acc, args.arg[0]); \ + } \ + \ + /* assigns args by arity */ \ + ALPAKA_NO_HOST_ACC_WARNING \ + template< \ + typename T, \ + typename TAcc = std::nullptr_t> \ + ALPAKA_FN_HOST_ACC \ + auto operator()( \ + ArgsItem const & args, \ + TAcc const & acc = nullptr) const \ + { \ + return execute(acc, args.arg[0], args.arg[1]); \ + } \ + \ + friend std::ostream & operator << ( \ + std::ostream &out, \ + const NAME &op) \ + { \ + out << #NAME; \ + alpaka::ignore_unused( op ); \ + return out; \ + } \ + }; + + +ALPAKA_TEST_MATH_OP_FUNCTOR( OpAbs, + Arity::Unary, + std::abs, + alpaka::math::abs, + Range::Unrestricted ) + +ALPAKA_TEST_MATH_OP_FUNCTOR( OpAcos, + Arity::Unary, + std::acos, + alpaka::math::acos, + Range::OneNeighbourhood ) + +ALPAKA_TEST_MATH_OP_FUNCTOR( OpAsin, + Arity::Unary, + std::asin, + alpaka::math::asin, + Range::OneNeighbourhood ) + +ALPAKA_TEST_MATH_OP_FUNCTOR( OpAtan, + Arity::Unary, + std::atan, + alpaka::math::atan, + Range::Unrestricted ) + +ALPAKA_TEST_MATH_OP_FUNCTOR( OpCbrt, + Arity::Unary, + std::cbrt, + alpaka::math::cbrt, + Range::Unrestricted ) + +ALPAKA_TEST_MATH_OP_FUNCTOR( OpCeil, + Arity::Unary, + std::ceil, + alpaka::math::ceil, + Range::Unrestricted ) + +ALPAKA_TEST_MATH_OP_FUNCTOR( OpCos, + Arity::Unary, + std::cos, + alpaka::math::cos, + Range::Unrestricted ) + +ALPAKA_TEST_MATH_OP_FUNCTOR( OpErf, + Arity::Unary, + std::erf, + alpaka::math::erf, + Range::Unrestricted ) + +ALPAKA_TEST_MATH_OP_FUNCTOR( OpExp, + Arity::Unary, + std::exp, + alpaka::math::exp, + Range::Unrestricted ) + +ALPAKA_TEST_MATH_OP_FUNCTOR( OpFloor, + Arity::Unary, + std::floor, + alpaka::math::floor, + Range::Unrestricted ) + +ALPAKA_TEST_MATH_OP_FUNCTOR( OpLog, + Arity::Unary, + std::log, + 
alpaka::math::log, + Range::PositiveOnly ) + +ALPAKA_TEST_MATH_OP_FUNCTOR( OpRound, + Arity::Unary, + std::round, + alpaka::math::round, + Range::Unrestricted ) + +// There is no std implementation look in Defines.hpp. +ALPAKA_TEST_MATH_OP_FUNCTOR( OpRsqrt, + Arity::Unary, + alpaka::test::unit::math::rsqrt, + alpaka::math::rsqrt, + Range::PositiveOnly ) + +ALPAKA_TEST_MATH_OP_FUNCTOR( OpSin, + Arity::Unary, + std::sin, + alpaka::math::sin, + Range::Unrestricted ) + +ALPAKA_TEST_MATH_OP_FUNCTOR( OpSqrt, + Arity::Unary, + std::sqrt, + alpaka::math::sqrt, + Range::PositiveAndZero ) + +ALPAKA_TEST_MATH_OP_FUNCTOR( OpTan, + Arity::Unary, + std::tan, + alpaka::math::tan, + Range::Unrestricted ) + +ALPAKA_TEST_MATH_OP_FUNCTOR( OpTrunc, + Arity::Unary, + std::trunc, + alpaka::math::trunc, + Range::Unrestricted ) + +// All binary operators. +ALPAKA_TEST_MATH_OP_FUNCTOR( OpAtan2, + Arity::Binary, + std::atan2, + alpaka::math::atan2, + Range::NotZero, + Range::NotZero ) + +ALPAKA_TEST_MATH_OP_FUNCTOR( OpFmod, + Arity::Binary, + std::fmod, + alpaka::math::fmod, + Range::Unrestricted, + Range::NotZero ) + +ALPAKA_TEST_MATH_OP_FUNCTOR( OpMax, + Arity::Binary, + std::max, + alpaka::math::max, + Range::Unrestricted, + Range::Unrestricted ) + +ALPAKA_TEST_MATH_OP_FUNCTOR( OpMin, + Arity::Binary, + std::min, + alpaka::math::min, + Range::Unrestricted, + Range::Unrestricted ) + +ALPAKA_TEST_MATH_OP_FUNCTOR( OpPow, + Arity::Binary, + std::pow, + alpaka::math::pow, + Range::PositiveAndZero, + Range::Unrestricted ) + +ALPAKA_TEST_MATH_OP_FUNCTOR( OpRemainder, + Arity::Binary, + std::remainder, + alpaka::math::remainder, + Range::Unrestricted, + Range::NotZero ) + +using BinaryFunctors = std::tuple< + OpAtan2, + OpFmod, + OpMax, + OpMin, + OpPow, + OpRemainder + >; + +using UnaryFunctors = std::tuple< + OpAbs, + OpAcos, + OpAsin, + OpAtan, + OpCbrt, + OpCeil, + OpCos, + OpErf, + OpExp, + OpFloor, + OpLog, + OpRound, + OpRsqrt, + OpSin, + OpSqrt, + OpTan, + OpTrunc + >; + +} // math +} // 
unit +} // test +} // alpaka diff --git a/thirdParty/cupla/alpaka/test/unit/math/src/math.cpp b/thirdParty/cupla/alpaka/test/unit/math/src/math.cpp new file mode 100644 index 0000000000..2849ebe24f --- /dev/null +++ b/thirdParty/cupla/alpaka/test/unit/math/src/math.cpp @@ -0,0 +1,233 @@ +/** Copyright 2019 Jakob Krude, Benjamin Worpitz + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include "Defines.hpp" +#include "Buffer.hpp" +#include "Functor.hpp" +#include "DataGen.hpp" + +#include +#include +#include + +#include + +using TestAccs = alpaka::test::acc::EnabledAccs< + alpaka::dim::DimInt< 1u >, + std::size_t +>; + +using Functors = + alpaka::meta::Concatenate< + alpaka::test::unit::math::UnaryFunctors, + alpaka::test::unit::math::BinaryFunctors + >; + +using TestAccFunctorTuples = + alpaka::meta::CartesianProduct< + std::tuple, + TestAccs, + Functors + >; + +using DataTypes = std::tuple< + float, + double +>; + +struct TestKernel +{ + //! @tparam TAcc Accelerator. + //! @tparam TFunctor Functor defined in Functor.hpp. + //! @param acc Accelerator given from alpaka. + //! @param functor Accessible with operator(). 
+ ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC auto operator()( + TAcc const & acc, + TResults const & results, + TFunctor const & functor, + TArgs const & args) const noexcept + -> void + { + for( size_t i = 0; i < TArgs::capacity; ++i ) + { + results(i, acc) = functor(args(i, acc), acc); + } + } +}; + +//############################################################################# +template< + typename TAcc, + typename TFunctor> +struct TestTemplate +{ + template< + typename TData> + auto operator()() -> void + { + std::random_device rd{}; + auto const seed = rd(); + std::cout << "testing" + << " acc:" << typeid(TAcc).name() + << " data type:" << typeid(TData).name() + << " functor:" << typeid(TFunctor).name() + << " seed:" << seed + << std::endl; + + // SETUP (defines and initialising) + // DevAcc and DevHost are defined in Buffer.hpp too. + using DevAcc = alpaka::dev::Dev< TAcc >; + using DevHost = alpaka::dev::DevCpu; + using PltfAcc = alpaka::pltf::Pltf< DevAcc >; + using PltfHost = alpaka::pltf::Pltf< DevHost >; + + using Dim = alpaka::dim::DimInt< 1u >; + using Idx = std::size_t; + using WorkDiv = alpaka::workdiv::WorkDivMembers; + using QueueAcc = alpaka::test::queue::DefaultQueue< DevAcc >; + using TArgsItem = alpaka::test::unit::math::ArgsItem; + + static constexpr auto capacity = 1000; + + using Args = alpaka::test::unit::math::Buffer< + TAcc, + TArgsItem, + capacity + >; + using Results = alpaka::test::unit::math::Buffer< + TAcc, + TData, + capacity + >; + + // Every functor is executed individual on one kernel. 
+ static constexpr size_t elementsPerThread = 1u; + static constexpr size_t sizeExtent = 1u; + + DevAcc const devAcc{ alpaka::pltf::getDevByIdx< PltfAcc >( 0u ) }; + DevHost const devHost{ alpaka::pltf::getDevByIdx< PltfHost >( 0u ) }; + + QueueAcc queue{ devAcc }; + + TestKernel kernel; + TFunctor functor; + Args args{ devAcc }; + Results results{ devAcc }; + + WorkDiv const workDiv{ + alpaka::workdiv::getValidWorkDiv< TAcc >( + devAcc, + sizeExtent, + elementsPerThread, + false, + alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted + )}; + // SETUP COMPLETED. + + // Fill the buffer with random test-numbers. + alpaka::test::unit::math::fillWithRndArgs( args, functor, seed ); + for( size_t i = 0; i < Results::capacity; ++i ) + results(i) = static_cast(std::nan( "" )); + + // Copy both buffer to the device + args.copyToDevice(queue); + results.copyToDevice(queue); + + auto const taskKernel( + alpaka::kernel::createTaskKernel< TAcc >( + workDiv, + kernel, + results, + functor, + args + ) + ); + // Enqueue the kernel execution task. + alpaka::queue::enqueue( queue, taskKernel ); + // Copy back the results (encapsulated in the buffer class). + results.copyFromDevice( queue ); + alpaka::wait::wait( queue ); + std::cout.precision( std::numeric_limits::digits10 + 1 ); + + INFO("Operator: " << functor) + INFO("Type: " << typeid( TData ).name() ) // Compiler specific. +#if ALPAKA_DEBUG_FULL + INFO("The args buffer: \n" << std::setprecision( + std::numeric_limits::digits10 + 1) + << args << "\n") +#endif + for( size_t i = 0; i < Args::capacity; ++i ) + { + INFO("Idx i: " << i) + TData std_result = functor(args(i)); + REQUIRE( results(i) == Approx(std_result) ); + } + } +}; + +TEMPLATE_LIST_TEST_CASE("mathOps", "[math] [operator]", TestAccFunctorTuples) +{ + /* + * All alpaka::math:: functions are tested here except sincos. + * The function will be called with a buffer from the custom Buffer class. 
+ * This argument Buffer contains ArgsItems from Defines.hpp and can be + * accessed with the overloaded operator(). + * The args Buffer looks similar like [[0, 1], [2, 3], [4, 5]], + * where every sub-list makes one functor-call so the result Buffer would be: + * [f(0, 1), f(2, 3), f(4, 5)]. + * The results are saved in a different Buffer witch contains plain data. + * The results are than compared to the result of a std:: implementation. + * The default result is nan and should fail a test. + * + * BE AWARE that: + * - ALPAKA_CUDA_FAST_MATH should be disabled + * - not all casts between float and double can be detected. + * - no explicit edge cases are tested, rather than 0, maximum and minimum + * - but it is easy to add a new Range:: enum-type with custom edge cases + * - some tests may fail if ALPAKA_CUDA_FAST_MATH is turned on + * - nan typically fails every test, but could be normal defined behaviour + * - inf/-inf typically dont fail a test + * - for easy debugging the << operator is overloaded for Buffer objects + * - arguments are generated between 0 and 1000 + * and the default argument-buffer-extent is 1000 + * The arguments are generated in DataGen.hpp and can easily be modified. + * The arguments depend on the Range:: enum-type specified for each functor. + * ---------------------------------------------------------------------- + * - each functor has an arity and a array of ranges + * - there is one args Buffer and one results Buffer + * - each buffer encapsulated the host/device communication + * - as well as the data access and the initialisation + * - all operators are tested independent, one per kernel + * - tests the results against the std implementation ( catch REQUIRES) + * + * TestKernel + * - uses the alpaka::math:: option from the functor + * - uses the device-buffer option from the args + * + * EXTENSIBILITY: + * - Add new operators in Functor.hpp and add them to the ...Functors tuple. 
+ * - Add a new Range:: enum-type in Defines.hpp + * - specify a fill-method in DataGen.hpp + * - Add a new Arity:: enum-type in Defines.hpp + * - add a matching operator() function in Functor.hpp, + * - add a new ...Functors tuple + * - call alpaka::meta::forEachType with the tuple in ForEachFunctor + */ + + using Acc = std::tuple_element_t<0u, TestType>; + using Functor = std::tuple_element_t<1u, TestType>; + + alpaka::meta::forEachType< DataTypes >( + TestTemplate< Acc, Functor >()); +} diff --git a/thirdParty/cupla/alpaka/test/unit/math/sincos/src/sincos.cpp b/thirdParty/cupla/alpaka/test/unit/math/src/sincos.cpp similarity index 98% rename from thirdParty/cupla/alpaka/test/unit/math/sincos/src/sincos.cpp rename to thirdParty/cupla/alpaka/test/unit/math/src/sincos.cpp index abda52fafc..2d8d815d43 100644 --- a/thirdParty/cupla/alpaka/test/unit/math/sincos/src/sincos.cpp +++ b/thirdParty/cupla/alpaka/test/unit/math/src/sincos.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -31,7 +31,6 @@ almost_equal(TAcc const & acc, FP x, FP y, int ulp) || alpaka::math::abs(acc, x-y) < std::numeric_limits::min(); } - class SinCosTestKernel { public: diff --git a/thirdParty/cupla/alpaka/test/unit/mem/buf/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/mem/buf/CMakeLists.txt index 7c080722b3..c18c594594 100644 --- a/thirdParty/cupla/alpaka/test/unit/mem/buf/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/unit/mem/buf/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/mem/buf/src/BufTest.cpp b/thirdParty/cupla/alpaka/test/unit/mem/buf/src/BufTest.cpp index a13a80384e..6d5871da56 100644 --- a/thirdParty/cupla/alpaka/test/unit/mem/buf/src/BufTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/mem/buf/src/BufTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -64,7 +64,7 @@ TEMPLATE_LIST_TEST_CASE( "memBufBasicTest", "[memBuf]", alpaka::test::acc::TestA using Dim = alpaka::dim::Dim; using Idx = alpaka::idx::Idx; - auto const extent(alpaka::vec::createVecFromIndexedFnWorkaround(Idx())); + auto const extent(alpaka::vec::createVecFromIndexedFn::template ForExtentBuf>()); testBufferMutable< Acc>( @@ -123,7 +123,7 @@ TEMPLATE_LIST_TEST_CASE( "memBufConstTest", "[memBuf]", alpaka::test::acc::TestA using Dim = alpaka::dim::Dim; using Idx = alpaka::idx::Idx; - auto const extent(alpaka::vec::createVecFromIndexedFnWorkaround(Idx())); + auto const extent(alpaka::vec::createVecFromIndexedFn::template ForExtentBuf>()); testBufferImmutable< Acc>( diff --git a/thirdParty/cupla/alpaka/test/unit/mem/copy/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/mem/copy/CMakeLists.txt new file mode 100644 index 0000000000..9ca808e7b4 --- /dev/null +++ b/thirdParty/cupla/alpaka/test/unit/mem/copy/CMakeLists.txt @@ -0,0 +1,24 @@ +# +# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl +# +# This file is part of Alpaka. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+# + +set(_TARGET_NAME "bufSlicing") + +append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE) + +alpaka_add_executable( + ${_TARGET_NAME} + ${_FILES_SOURCE}) +target_link_libraries( + ${_TARGET_NAME} + PRIVATE common) + +set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit") + +add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS}) diff --git a/thirdParty/cupla/alpaka/test/unit/mem/copy/src/BufSlicing.cpp b/thirdParty/cupla/alpaka/test/unit/mem/copy/src/BufSlicing.cpp new file mode 100644 index 0000000000..9900396768 --- /dev/null +++ b/thirdParty/cupla/alpaka/test/unit/mem/copy/src/BufSlicing.cpp @@ -0,0 +1,314 @@ +/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Jakob Krude + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include +#include +#include +#include + +#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED) + #pragma warning(push) + #pragma warning(disable: 4127) // suppress warning for c++17 conditional expression is constant +#endif + +template< + typename TDim, + typename TIdx, + typename TAcc, + typename TData, + typename Vec = alpaka::vec::Vec< + TDim, + TIdx>> +struct TestContainer +{ + + using AccQueueProperty = alpaka::queue::Blocking; + using DevQueue = alpaka::queue::Queue< + TAcc, + AccQueueProperty + >; + using DevAcc = alpaka::dev::Dev; + using PltfAcc = alpaka::pltf::Pltf; + + using DevHost = alpaka::dev::DevCpu; + using PltfHost = alpaka::pltf::Pltf; + + using BufHost = alpaka::mem::buf::Buf< + DevHost, + TData, + TDim, + TIdx + >; + using BufDevice = alpaka::mem::buf::Buf< + DevAcc, + TData, + TDim, + TIdx + >; + + using SubView = alpaka::mem::view::ViewSubView< + DevAcc, + TData, + TDim, + TIdx + >; + + DevAcc const devAcc; + DevHost const devHost; + DevQueue devQueue; + + + 
// Constructor + TestContainer(): + devAcc(alpaka::pltf::getDevByIdx(0u)), + devHost(alpaka::pltf::getDevByIdx(0u)), + devQueue(devAcc) + {} + + + auto createHostBuffer( + Vec extents, + bool indexed + ) -> BufHost + { + BufHost bufHost( + alpaka::mem::buf::alloc< + TData, + TIdx + >( + devHost, + extents + )); + if(indexed) + { + TData *const ptr = alpaka::mem::view::getPtrNative(bufHost); + for(TIdx i(0);i < extents.prod();++i) + { + ptr[i] = static_cast(i); + } + } + return bufHost; + } + + + auto createDeviceBuffer(Vec extents) -> BufDevice + { + BufDevice bufDevice( + alpaka::mem::buf::alloc< + TData, + TIdx + >( + devAcc, + extents + )); + return bufDevice; + } + + + auto copyToAcc( + BufHost bufHost, + BufDevice bufAcc, + Vec extents + ) -> void + { + alpaka::mem::view::copy( + devQueue, + bufAcc, + bufHost, + extents + ); + } + + + auto copyToHost( + BufDevice bufAcc, + BufHost bufHost, + Vec extents + ) -> void + { + alpaka::mem::view::copy( + devQueue, + bufHost, + bufAcc, + extents + ); + } + + + auto sliceOnDevice( + BufDevice bufferToBeSliced, + Vec subViewExtents, + Vec offsets + ) -> BufDevice + { + BufDevice slicedBuffer = createDeviceBuffer(subViewExtents); + // Create a subView with a possible offset. + SubView subView = SubView( + bufferToBeSliced, + subViewExtents, + offsets + ); + // Copy the subView into a new buffer. 
+ alpaka::mem::view::copy( + devQueue, + slicedBuffer, + subView, + subViewExtents + ); + return slicedBuffer; + } + + + auto compareBuffer( + BufHost const & bufferA, + BufHost const & bufferB, + Vec const & extents + ) const + { + TData const *const ptrA = alpaka::mem::view::getPtrNative(bufferA); + TData const *const ptrB = alpaka::mem::view::getPtrNative(bufferB); + for(TIdx i(0);i < extents.prod();++i) + { + INFO("Dim: " << TDim::value) + INFO("Idx: " << typeid(TIdx).name()) + INFO("Acc: " << alpaka::acc::traits::GetAccName::getAccName()) + INFO("i: " << i) + REQUIRE(ptrA[i] == Approx(ptrB[i])); + } + + } +}; + +using DataTypes = std::tuple< + int, + float, + double +>; + +using TestAccWithDataTypes = +alpaka::meta::CartesianProduct< + std::tuple, + alpaka::test::acc::TestAccs, + DataTypes +>; + +TEMPLATE_LIST_TEST_CASE("memBufSlicingTest", + "[memBuf]", + TestAccWithDataTypes) +{ + using Acc = std::tuple_element_t< + 0, + TestType + >; + using Data = std::tuple_element_t< + 1, + TestType + >; + using Dim = alpaka::dim::Dim; + // fourth-dimension is not supposed to be tested currently + if(Dim::value == 4) + { + return; + } + using Idx = alpaka::idx::Idx; + TestContainer< + Dim, + Idx, + Acc, + Data + > slicingTest; + + auto const extents( + alpaka::vec::createVecFromIndexedFn< + Dim, + alpaka::test::CreateVecWithIdx::template ForExtentBuf + >()); + + auto const extentsSubView( + alpaka::vec::createVecFromIndexedFn< + Dim, + alpaka::test::CreateVecWithIdx::template ForExtentSubView + >()); + auto const offsets( + alpaka::vec::createVecFromIndexedFn< + Dim, + alpaka::test::CreateVecWithIdx::template ForOffset + >()); + + // This is the initial buffer. + auto const indexedBuffer = slicingTest.createHostBuffer( + extents, + true + ); + // This buffer will hold the sliced-buffer when it was copied to the host. + auto resultBuffer = slicingTest.createHostBuffer( + extentsSubView, + false + ); + + // Copy of the indexBuffer on the deviceSide. 
+ auto deviceBuffer = slicingTest.createDeviceBuffer(extents); + + // Start: Main-Test + slicingTest.copyToAcc( + indexedBuffer, + deviceBuffer, + extents + ); + + auto slicedBuffer = slicingTest.sliceOnDevice( + deviceBuffer, + extentsSubView, + offsets + ); + + slicingTest.copyToHost( + slicedBuffer, + resultBuffer, + extentsSubView + ); + + auto correctResults = slicingTest.createHostBuffer( + extentsSubView, + false + ); + Data *ptrNative = alpaka::mem::view::getPtrNative(correctResults); + using Dim1 = alpaka::dim::DimInt<1u>; + + for(Idx i(0);i < extentsSubView.prod();++i) + { + auto mappedToND = alpaka::idx::mapIdx< + Dim::value, + Dim1::value + >( + alpaka::vec::Vec< + Dim1, + Idx + >(i), + extentsSubView + ); + auto addedOffset = mappedToND + offsets; + auto mappedTo1D = alpaka::idx::mapIdx( + addedOffset, + extents + )[0]; // take the only element in the vector + ptrNative[i] = static_cast(mappedTo1D); + } + + // resultBuffer will be compared with the manually computed results. + slicingTest.compareBuffer( + resultBuffer, + correctResults, + extentsSubView + ); +} + +#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED) + #pragma warning(pop) +#endif diff --git a/thirdParty/cupla/alpaka/test/unit/mem/p2p/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/mem/p2p/CMakeLists.txt index 998823b45b..4685b42b75 100644 --- a/thirdParty/cupla/alpaka/test/unit/mem/p2p/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/unit/mem/p2p/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/mem/p2p/src/P2P.cpp b/thirdParty/cupla/alpaka/test/unit/mem/p2p/src/P2P.cpp index c9a4da99b7..d7afcc4028 100644 --- a/thirdParty/cupla/alpaka/test/unit/mem/p2p/src/P2P.cpp +++ b/thirdParty/cupla/alpaka/test/unit/mem/p2p/src/P2P.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -72,7 +72,7 @@ TEMPLATE_LIST_TEST_CASE( "memP2PTest", "[memP2P]", alpaka::test::acc::TestAccs) using Dim = alpaka::dim::Dim; using Idx = alpaka::idx::Idx; - auto const extent(alpaka::vec::createVecFromIndexedFnWorkaround(Idx())); + auto const extent(alpaka::vec::createVecFromIndexedFn::template ForExtentBuf>()); testP2P( extent ); #endif diff --git a/thirdParty/cupla/alpaka/test/unit/mem/view/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/mem/view/CMakeLists.txt index a3b1ddcd68..8fb02354ce 100644 --- a/thirdParty/cupla/alpaka/test/unit/mem/view/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/unit/mem/view/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewPlainPtrTest.cpp b/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewPlainPtrTest.cpp index 1f186bc1a8..d3987c5db8 100644 --- a/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewPlainPtrTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewPlainPtrTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker * - * This file is part of Alpaka. 
+ * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -104,7 +104,7 @@ namespace view Dev const dev(alpaka::pltf::getDevByIdx(0u)); - auto const extentBuf(alpaka::vec::createVecFromIndexedFnWorkaround(Idx())); + auto const extentBuf(alpaka::vec::createVecFromIndexedFn::template ForExtentBuf>()); auto buf(alpaka::mem::buf::alloc(dev, extentBuf)); auto const extentView(extentBuf); @@ -134,7 +134,7 @@ namespace view Dev const dev(alpaka::pltf::getDevByIdx(0u)); - auto const extentBuf(alpaka::vec::createVecFromIndexedFnWorkaround(Idx())); + auto const extentBuf(alpaka::vec::createVecFromIndexedFn::template ForExtentBuf>()); auto buf(alpaka::mem::buf::alloc(dev, extentBuf)); auto const extentView(extentBuf); @@ -164,7 +164,7 @@ namespace view Dev const dev(alpaka::pltf::getDevByIdx(0u)); - auto const extentBuf(alpaka::vec::createVecFromIndexedFnWorkaround(Idx())); + auto const extentBuf(alpaka::vec::createVecFromIndexedFn::template ForExtentBuf>()); auto buf(alpaka::mem::buf::alloc(dev, extentBuf)); View view( diff --git a/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewStaticAccMem.cpp b/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewStaticAccMem.cpp index a1eb1c094d..4e66df4afa 100644 --- a/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewStaticAccMem.cpp +++ b/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewStaticAccMem.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewSubViewTest.cpp b/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewSubViewTest.cpp index ea6a0f4960..83f46af3f7 100644 --- a/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewSubViewTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewSubViewTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -142,7 +142,7 @@ namespace view Dev const dev(alpaka::pltf::getDevByIdx(0u)); - auto const extentBuf(alpaka::vec::createVecFromIndexedFnWorkaround(Idx())); + auto const extentBuf(alpaka::vec::createVecFromIndexedFn::template ForExtentBuf>()); auto buf(alpaka::mem::buf::alloc(dev, extentBuf)); auto const extentView(extentBuf); @@ -168,11 +168,11 @@ namespace view Dev const dev(alpaka::pltf::getDevByIdx(0u)); - auto const extentBuf(alpaka::vec::createVecFromIndexedFnWorkaround(Idx())); + auto const extentBuf(alpaka::vec::createVecFromIndexedFn::template ForExtentBuf>()); auto buf(alpaka::mem::buf::alloc(dev, extentBuf)); - auto const extentView(alpaka::vec::createVecFromIndexedFnWorkaround(Idx())); - auto const offsetView(alpaka::vec::Vec::all(static_cast(1))); + auto const extentView(alpaka::vec::createVecFromIndexedFn::template ForExtentSubView>()); + auto const offsetView(alpaka::vec::createVecFromIndexedFn::template ForOffset>()); View view(buf, extentView, offsetView); alpaka::test::mem::view::testViewSubViewMutable(view, buf, dev, extentView, offsetView); @@ -194,11 +194,11 @@ namespace view Dev const dev(alpaka::pltf::getDevByIdx(0u)); - auto const extentBuf(alpaka::vec::createVecFromIndexedFnWorkaround(Idx())); + auto const extentBuf(alpaka::vec::createVecFromIndexedFn::template ForExtentBuf>()); auto 
buf(alpaka::mem::buf::alloc(dev, extentBuf)); - auto const extentView(alpaka::vec::createVecFromIndexedFnWorkaround(Idx())); - auto const offsetView(alpaka::vec::Vec::all(static_cast(1))); + auto const extentView(alpaka::vec::createVecFromIndexedFn::template ForExtentSubView>()); + auto const offsetView(alpaka::vec::createVecFromIndexedFn::template ForOffset>()); View const view(buf, extentView, offsetView); alpaka::test::mem::view::testViewSubViewImmutable(view, buf, dev, extentView, offsetView); diff --git a/thirdParty/cupla/alpaka/test/unit/meta/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/meta/CMakeLists.txt index dce19a7b41..87382e861b 100644 --- a/thirdParty/cupla/alpaka/test/unit/meta/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/unit/meta/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/ApplyTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/ApplyTest.cpp index 7da938ec7c..d686768ed2 100644 --- a/thirdParty/cupla/alpaka/test/unit/meta/src/ApplyTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/meta/src/ApplyTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/ApplyTupleTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/ApplyTupleTest.cpp index 4670fe85c4..4d51dbda5f 100644 --- a/thirdParty/cupla/alpaka/test/unit/meta/src/ApplyTupleTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/meta/src/ApplyTupleTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/CartesianProductTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/CartesianProductTest.cpp index 1ea9a9081c..7667590e66 100644 --- a/thirdParty/cupla/alpaka/test/unit/meta/src/CartesianProductTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/meta/src/CartesianProductTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/ConcatenateTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/ConcatenateTest.cpp index 2376e7217c..267c2bc812 100644 --- a/thirdParty/cupla/alpaka/test/unit/meta/src/ConcatenateTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/meta/src/ConcatenateTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/FilterTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/FilterTest.cpp index 180d5a333e..e8bd691af3 100644 --- a/thirdParty/cupla/alpaka/test/unit/meta/src/FilterTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/meta/src/FilterTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/IntegralTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/IntegralTest.cpp index 41e2b8f2ca..87b7b22b1b 100644 --- a/thirdParty/cupla/alpaka/test/unit/meta/src/IntegralTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/meta/src/IntegralTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/IsStrictBaseTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/IsStrictBaseTest.cpp index 350353200e..d6f80d8fe9 100644 --- a/thirdParty/cupla/alpaka/test/unit/meta/src/IsStrictBaseTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/meta/src/IsStrictBaseTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/MetafunctionsTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/MetafunctionsTest.cpp index 2134ebcf1e..7f2a276891 100644 --- a/thirdParty/cupla/alpaka/test/unit/meta/src/MetafunctionsTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/meta/src/MetafunctionsTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/SetTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/SetTest.cpp index eb0ffc77af..d69d5ef1fb 100644 --- a/thirdParty/cupla/alpaka/test/unit/meta/src/SetTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/meta/src/SetTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/TransformTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/TransformTest.cpp index dba8dd0785..f726a508c2 100644 --- a/thirdParty/cupla/alpaka/test/unit/meta/src/TransformTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/meta/src/TransformTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/UniqueTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/UniqueTest.cpp index 36cb4665d4..8be0200d5b 100644 --- a/thirdParty/cupla/alpaka/test/unit/meta/src/UniqueTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/meta/src/UniqueTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/queue/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/queue/CMakeLists.txt index 32d0f6c2a1..6b586ed1c6 100644 --- a/thirdParty/cupla/alpaka/test/unit/queue/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/unit/queue/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2017-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/queue/src/CollectiveQueue.cpp b/thirdParty/cupla/alpaka/test/unit/queue/src/CollectiveQueue.cpp index 3fb220a3b3..44a245ff57 100644 --- a/thirdParty/cupla/alpaka/test/unit/queue/src/CollectiveQueue.cpp +++ b/thirdParty/cupla/alpaka/test/unit/queue/src/CollectiveQueue.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/queue/src/QueueTest.cpp b/thirdParty/cupla/alpaka/test/unit/queue/src/QueueTest.cpp index 9b9b6461d9..b3a715bef3 100644 --- a/thirdParty/cupla/alpaka/test/unit/queue/src/QueueTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/queue/src/QueueTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/rand/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/rand/CMakeLists.txt index 475a6b297b..21101813e7 100644 --- a/thirdParty/cupla/alpaka/test/unit/rand/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/unit/rand/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2017-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/rand/src/RandTest.cpp b/thirdParty/cupla/alpaka/test/unit/rand/src/RandTest.cpp index 15fa2f3867..77be04e3df 100644 --- a/thirdParty/cupla/alpaka/test/unit/rand/src/RandTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/rand/src/RandTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/time/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/time/CMakeLists.txt index 54c3c821ff..0cc9a1e35a 100644 --- a/thirdParty/cupla/alpaka/test/unit/time/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/unit/time/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2016-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/time/src/ClockTest.cpp b/thirdParty/cupla/alpaka/test/unit/time/src/ClockTest.cpp index dd99b86d02..5d9981097e 100644 --- a/thirdParty/cupla/alpaka/test/unit/time/src/ClockTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/time/src/ClockTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/vec/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/vec/CMakeLists.txt index 0f0891af49..229b0ef2e7 100644 --- a/thirdParty/cupla/alpaka/test/unit/vec/CMakeLists.txt +++ b/thirdParty/cupla/alpaka/test/unit/vec/CMakeLists.txt @@ -1,7 +1,7 @@ # # Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan # -# This file is part of Alpaka. +# This file is part of alpaka. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. 
If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/vec/src/VecTest.cpp b/thirdParty/cupla/alpaka/test/unit/vec/src/VecTest.cpp index 62cabe584a..71b66620af 100644 --- a/thirdParty/cupla/alpaka/test/unit/vec/src/VecTest.cpp +++ b/thirdParty/cupla/alpaka/test/unit/vec/src/VecTest.cpp @@ -1,6 +1,6 @@ /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker * - * This file is part of Alpaka. + * This file is part of alpaka. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this diff --git a/thirdParty/cupla/alpaka/test/unit/warp/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/warp/CMakeLists.txt new file mode 100644 index 0000000000..82eb4d69a5 --- /dev/null +++ b/thirdParty/cupla/alpaka/test/unit/warp/CMakeLists.txt @@ -0,0 +1,24 @@ +# +# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan +# +# This file is part of Alpaka. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# + +set(_TARGET_NAME "warp") + +append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE) + +alpaka_add_executable( + ${_TARGET_NAME} + ${_FILES_SOURCE}) +target_link_libraries( + ${_TARGET_NAME} + PRIVATE common) + +set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit") + +add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS}) diff --git a/thirdParty/cupla/alpaka/test/unit/warp/src/Activemask.cpp b/thirdParty/cupla/alpaka/test/unit/warp/src/Activemask.cpp new file mode 100644 index 0000000000..76acf37624 --- /dev/null +++ b/thirdParty/cupla/alpaka/test/unit/warp/src/Activemask.cpp @@ -0,0 +1,129 @@ +/* Copyright 2020 Sergei Bastrakov + * + * This file is part of Alpaka. 
+ * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include + +#include +#include +#include + +#include + +#include + +//############################################################################# +class ActivemaskSingleThreadWarpTestKernel +{ +public: + //----------------------------------------------------------------------------- + ALPAKA_NO_HOST_ACC_WARNING + template< + typename TAcc> + ALPAKA_FN_ACC auto operator()( + TAcc const & acc, + bool * success) const + -> void + { + std::int32_t const warpExtent = alpaka::warp::getSize(acc); + ALPAKA_CHECK(*success, warpExtent == 1); + + ALPAKA_CHECK(*success, alpaka::warp::activemask(acc) == 1u); + } +}; + +//############################################################################# +class ActivemaskMultipleThreadWarpTestKernel +{ +public: + //----------------------------------------------------------------------------- + ALPAKA_NO_HOST_ACC_WARNING + template< + typename TAcc> + ALPAKA_FN_ACC auto operator()( + TAcc const & acc, + bool * success, + std::uint64_t inactiveThreadIdx) const + -> void + { + std::int32_t const warpExtent = alpaka::warp::getSize(acc); + ALPAKA_CHECK(*success, warpExtent > 1); + + // Test relies on having a single warp per thread block + auto const blockExtent = alpaka::workdiv::getWorkDiv(acc); + ALPAKA_CHECK(*success, static_cast(blockExtent.prod()) == warpExtent); + auto const localThreadIdx = alpaka::idx::getIdx(acc); + auto const threadIdxInWarp = static_cast( + alpaka::idx::mapIdx<1u>( + localThreadIdx, + blockExtent)[0] + ); + + if (threadIdxInWarp == inactiveThreadIdx) + return; + + auto const actual = alpaka::warp::activemask(acc); + using Result = decltype(actual); + Result const allActive = + (Result{1} << static_cast(warpExtent)) - 1; + Result const expected = allActive & + ~(Result{1} << inactiveThreadIdx); + 
ALPAKA_CHECK( + *success, + actual == expected); + } +}; + +//----------------------------------------------------------------------------- +TEMPLATE_LIST_TEST_CASE( "activemask", "[warp]", alpaka::test::acc::TestAccs) +{ + using Acc = TestType; + using Dev = alpaka::dev::Dev; + using Pltf = alpaka::pltf::Pltf; + using Dim = alpaka::dim::Dim; + using Idx = alpaka::idx::Idx; + + Dev const dev(alpaka::pltf::getDevByIdx(0u)); + auto const warpExtent = alpaka::dev::getWarpSize(dev); + if (warpExtent == 1) + { + Idx const gridThreadExtentPerDim = 4; + alpaka::test::KernelExecutionFixture fixture( + alpaka::vec::Vec::all(gridThreadExtentPerDim)); + ActivemaskSingleThreadWarpTestKernel kernel; + REQUIRE( + fixture( + kernel)); + } + else + { + // Work around gcc 7.5 trying and failing to offload for OpenMP 4.0 +#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 5, 0)) && ALPAKA_ACC_CPU_BT_OMP4_ENABLED + return; +#else + using ExecutionFixture = alpaka::test::KernelExecutionFixture; + auto const gridBlockExtent = alpaka::vec::Vec::all(2); + // Enforce one warp per thread block + auto blockThreadExtent = alpaka::vec::Vec::ones(); + blockThreadExtent[0] = static_cast(warpExtent); + auto const threadElementExtent = alpaka::vec::Vec::ones(); + auto workDiv = typename ExecutionFixture::WorkDiv{ + gridBlockExtent, + blockThreadExtent, + threadElementExtent}; + auto fixture = ExecutionFixture{ workDiv }; + ActivemaskMultipleThreadWarpTestKernel kernel; + for (auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; + inactiveThreadIdx++) + REQUIRE( + fixture( + kernel, + inactiveThreadIdx)); +#endif + } +} diff --git a/thirdParty/cupla/alpaka/test/unit/warp/src/All.cpp b/thirdParty/cupla/alpaka/test/unit/warp/src/All.cpp new file mode 100644 index 0000000000..ffb60ed325 --- /dev/null +++ b/thirdParty/cupla/alpaka/test/unit/warp/src/All.cpp @@ -0,0 +1,130 @@ +/* Copyright 2020 Sergei Bastrakov + * + * This file is part of Alpaka. 
+ * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include + +#include +#include +#include + +#include + +#include + +//############################################################################# +class AllSingleThreadWarpTestKernel +{ +public: + //----------------------------------------------------------------------------- + ALPAKA_NO_HOST_ACC_WARNING + template< + typename TAcc> + ALPAKA_FN_ACC auto operator()( + TAcc const & acc, + bool * success) const + -> void + { + std::int32_t const warpExtent = alpaka::warp::getSize(acc); + ALPAKA_CHECK(*success, warpExtent == 1); + + ALPAKA_CHECK(*success, alpaka::warp::all(acc, 42) != 0); + ALPAKA_CHECK(*success, alpaka::warp::all(acc, 0) == 0); + } +}; + +//############################################################################# +class AllMultipleThreadWarpTestKernel +{ +public: + //----------------------------------------------------------------------------- + ALPAKA_NO_HOST_ACC_WARNING + template< + typename TAcc> + ALPAKA_FN_ACC auto operator()( + TAcc const & acc, + bool * success) const + -> void + { + std::int32_t const warpExtent = alpaka::warp::getSize(acc); + ALPAKA_CHECK(*success, warpExtent > 1); + + ALPAKA_CHECK(*success, alpaka::warp::all(acc, 0) == 0); + ALPAKA_CHECK(*success, alpaka::warp::all(acc, 42) != 0); + + // Test relies on having a single warp per thread block + auto const blockExtent = alpaka::workdiv::getWorkDiv(acc); + ALPAKA_CHECK(*success, static_cast(blockExtent.prod()) == warpExtent); + auto const localThreadIdx = alpaka::idx::getIdx(acc); + auto const threadIdxInWarp = static_cast(alpaka::idx::mapIdx<1u>( + localThreadIdx, + blockExtent)[0]); + + // Some threads quit the kernel to test that the warp operations + // properly operate on the active threads only + if (threadIdxInWarp % 3) + return; + + for (auto idx = 0; 
idx < warpExtent; idx++) + { + ALPAKA_CHECK( + *success, + alpaka::warp::all(acc, threadIdxInWarp == idx ? 1 : 0) == 0); + std::int32_t const expected = idx % 3 ? 1 : 0; + ALPAKA_CHECK( + *success, + alpaka::warp::all(acc, threadIdxInWarp == idx ? 0 : 1) == expected); + } + } +}; + +//----------------------------------------------------------------------------- +TEMPLATE_LIST_TEST_CASE( "all", "[warp]", alpaka::test::acc::TestAccs) +{ + using Acc = TestType; + using Dev = alpaka::dev::Dev; + using Pltf = alpaka::pltf::Pltf; + using Dim = alpaka::dim::Dim; + using Idx = alpaka::idx::Idx; + + Dev const dev(alpaka::pltf::getDevByIdx(0u)); + auto const warpExtent = alpaka::dev::getWarpSize(dev); + if (warpExtent == 1) + { + Idx const gridThreadExtentPerDim = 4; + alpaka::test::KernelExecutionFixture fixture( + alpaka::vec::Vec::all(gridThreadExtentPerDim)); + AllSingleThreadWarpTestKernel kernel; + REQUIRE( + fixture( + kernel)); + } + else + { + // Work around gcc 7.5 trying and failing to offload for OpenMP 4.0 +#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 5, 0)) && ALPAKA_ACC_CPU_BT_OMP4_ENABLED + return; +#else + using ExecutionFixture = alpaka::test::KernelExecutionFixture; + auto const gridBlockExtent = alpaka::vec::Vec::all(2); + // Enforce one warp per thread block + auto blockThreadExtent = alpaka::vec::Vec::ones(); + blockThreadExtent[0] = static_cast(warpExtent); + auto const threadElementExtent = alpaka::vec::Vec::ones(); + auto workDiv = typename ExecutionFixture::WorkDiv{ + gridBlockExtent, + blockThreadExtent, + threadElementExtent}; + auto fixture = ExecutionFixture{ workDiv }; + AllMultipleThreadWarpTestKernel kernel; + REQUIRE( + fixture( + kernel)); +#endif + } +} diff --git a/thirdParty/cupla/alpaka/test/unit/warp/src/Any.cpp b/thirdParty/cupla/alpaka/test/unit/warp/src/Any.cpp new file mode 100644 index 0000000000..4c3e1d7840 --- /dev/null +++ b/thirdParty/cupla/alpaka/test/unit/warp/src/Any.cpp @@ -0,0 +1,130 @@ +/* Copyright 
2020 Sergei Bastrakov + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include + +#include +#include +#include + +#include + +#include + +//############################################################################# +class AnySingleThreadWarpTestKernel +{ +public: + //----------------------------------------------------------------------------- + ALPAKA_NO_HOST_ACC_WARNING + template< + typename TAcc> + ALPAKA_FN_ACC auto operator()( + TAcc const & acc, + bool * success) const + -> void + { + std::int32_t const warpExtent = alpaka::warp::getSize(acc); + ALPAKA_CHECK(*success, warpExtent == 1); + + ALPAKA_CHECK(*success, alpaka::warp::any(acc, 42) != 0); + ALPAKA_CHECK(*success, alpaka::warp::any(acc, 0) == 0); + } +}; + +//############################################################################# +class AnyMultipleThreadWarpTestKernel +{ +public: + //----------------------------------------------------------------------------- + ALPAKA_NO_HOST_ACC_WARNING + template< + typename TAcc> + ALPAKA_FN_ACC auto operator()( + TAcc const & acc, + bool * success) const + -> void + { + std::int32_t const warpExtent = alpaka::warp::getSize(acc); + ALPAKA_CHECK(*success, warpExtent > 1); + + ALPAKA_CHECK(*success, alpaka::warp::any(acc, 0) == 0); + ALPAKA_CHECK(*success, alpaka::warp::any(acc, 42) != 0); + + // Test relies on having a single warp per thread block + auto const blockExtent = alpaka::workdiv::getWorkDiv(acc); + ALPAKA_CHECK(*success, static_cast(blockExtent.prod()) == warpExtent); + auto const localThreadIdx = alpaka::idx::getIdx(acc); + auto const threadIdxInWarp = static_cast(alpaka::idx::mapIdx<1u>( + localThreadIdx, + blockExtent)[0]); + + // Some threads quit the kernel to test that the warp operations + // properly operate on the active threads only + 
if (threadIdxInWarp % 5) + return; + + for (auto idx = 0; idx < warpExtent; idx++) + { + ALPAKA_CHECK( + *success, + alpaka::warp::any(acc, threadIdxInWarp == idx ? 0 : 1) == 1); + std::int32_t const expected = idx % 5 ? 0 : 1; + ALPAKA_CHECK( + *success, + alpaka::warp::any(acc, threadIdxInWarp == idx ? 1 : 0) == expected); + } + } +}; + +//----------------------------------------------------------------------------- +TEMPLATE_LIST_TEST_CASE( "any", "[warp]", alpaka::test::acc::TestAccs) +{ + using Acc = TestType; + using Dev = alpaka::dev::Dev; + using Pltf = alpaka::pltf::Pltf; + using Dim = alpaka::dim::Dim; + using Idx = alpaka::idx::Idx; + + Dev const dev(alpaka::pltf::getDevByIdx(0u)); + auto const warpExtent = alpaka::dev::getWarpSize(dev); + if (warpExtent == 1) + { + Idx const gridThreadExtentPerDim = 4; + alpaka::test::KernelExecutionFixture fixture( + alpaka::vec::Vec::all(gridThreadExtentPerDim)); + AnySingleThreadWarpTestKernel kernel; + REQUIRE( + fixture( + kernel)); + } + else + { + // Work around gcc 7.5 trying and failing to offload for OpenMP 4.0 +#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 5, 0)) && ALPAKA_ACC_CPU_BT_OMP4_ENABLED + return; +#else + using ExecutionFixture = alpaka::test::KernelExecutionFixture; + auto const gridBlockExtent = alpaka::vec::Vec::all(2); + // Enforce one warp per thread block + auto blockThreadExtent = alpaka::vec::Vec::ones(); + blockThreadExtent[0] = static_cast(warpExtent); + auto const threadElementExtent = alpaka::vec::Vec::ones(); + auto workDiv = typename ExecutionFixture::WorkDiv{ + gridBlockExtent, + blockThreadExtent, + threadElementExtent}; + auto fixture = ExecutionFixture{ workDiv }; + AnyMultipleThreadWarpTestKernel kernel; + REQUIRE( + fixture( + kernel)); +#endif + } +} diff --git a/thirdParty/cupla/alpaka/test/unit/warp/src/Ballot.cpp b/thirdParty/cupla/alpaka/test/unit/warp/src/Ballot.cpp new file mode 100644 index 0000000000..3624b6393b --- /dev/null +++ 
b/thirdParty/cupla/alpaka/test/unit/warp/src/Ballot.cpp @@ -0,0 +1,136 @@ +/* Copyright 2020 Sergei Bastrakov + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include + +#include +#include +#include + +#include + +#include + +//############################################################################# +class BallotSingleThreadWarpTestKernel +{ +public: + //----------------------------------------------------------------------------- + ALPAKA_NO_HOST_ACC_WARNING + template< + typename TAcc> + ALPAKA_FN_ACC auto operator()( + TAcc const & acc, + bool * success) const + -> void + { + std::int32_t const warpExtent = alpaka::warp::getSize(acc); + ALPAKA_CHECK(*success, warpExtent == 1); + + ALPAKA_CHECK(*success, alpaka::warp::ballot(acc, 42) == 1u); + ALPAKA_CHECK(*success, alpaka::warp::ballot(acc, 0) == 0u); + } +}; + +//############################################################################# +class BallotMultipleThreadWarpTestKernel +{ +public: + //----------------------------------------------------------------------------- + ALPAKA_NO_HOST_ACC_WARNING + template< + typename TAcc> + ALPAKA_FN_ACC auto operator()( + TAcc const & acc, + bool * success) const + -> void + { + std::int32_t const warpExtent = alpaka::warp::getSize(acc); + ALPAKA_CHECK(*success, warpExtent > 1); + + ALPAKA_CHECK(*success, alpaka::warp::ballot(acc, 42) == + (std::uint64_t{1} << warpExtent) - 1); + ALPAKA_CHECK(*success, alpaka::warp::ballot(acc, 0) == 0u); + + // Test relies on having a single warp per thread block + auto const blockExtent = alpaka::workdiv::getWorkDiv(acc); + ALPAKA_CHECK(*success, static_cast(blockExtent.prod()) == warpExtent); + auto const localThreadIdx = alpaka::idx::getIdx(acc); + auto const threadIdxInWarp = static_cast(alpaka::idx::mapIdx<1u>( + 
localThreadIdx, + blockExtent)[0]); + + // Some threads quit the kernel to test that the warp operations + // properly operate on the active threads only + if (threadIdxInWarp >= warpExtent / 2) + return; + + for (auto idx = 0; idx < warpExtent / 2; idx++) + { + ALPAKA_CHECK( + *success, + alpaka::warp::ballot(acc, threadIdxInWarp == idx ? 1 : 0) == + std::uint64_t{1} << idx); + // First warpExtent / 2 bits are 1 except bit idx + std::uint64_t const expected = + ((std::uint64_t{1} << warpExtent / 2) - 1) & + ~(std::uint64_t{1} << idx); + ALPAKA_CHECK( + *success, + alpaka::warp::ballot(acc, threadIdxInWarp == idx ? 0 : 1) == + expected); + } + } +}; + +//----------------------------------------------------------------------------- +TEMPLATE_LIST_TEST_CASE( "ballot", "[warp]", alpaka::test::acc::TestAccs) +{ + using Acc = TestType; + using Dev = alpaka::dev::Dev; + using Pltf = alpaka::pltf::Pltf; + using Dim = alpaka::dim::Dim; + using Idx = alpaka::idx::Idx; + + Dev const dev(alpaka::pltf::getDevByIdx(0u)); + auto const warpExtent = alpaka::dev::getWarpSize(dev); + if (warpExtent == 1) + { + Idx const gridThreadExtentPerDim = 4; + alpaka::test::KernelExecutionFixture fixture( + alpaka::vec::Vec::all(gridThreadExtentPerDim)); + BallotSingleThreadWarpTestKernel kernel; + REQUIRE( + fixture( + kernel)); + } + else + { + // Work around gcc 7.5 trying and failing to offload for OpenMP 4.0 +#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 5, 0)) && ALPAKA_ACC_CPU_BT_OMP4_ENABLED + return; +#else + using ExecutionFixture = alpaka::test::KernelExecutionFixture; + auto const gridBlockExtent = alpaka::vec::Vec::all(2); + // Enforce one warp per thread block + auto blockThreadExtent = alpaka::vec::Vec::ones(); + blockThreadExtent[0] = static_cast(warpExtent); + auto const threadElementExtent = alpaka::vec::Vec::ones(); + auto workDiv = typename ExecutionFixture::WorkDiv{ + gridBlockExtent, + blockThreadExtent, + threadElementExtent}; + auto fixture = 
ExecutionFixture{ workDiv }; + BallotMultipleThreadWarpTestKernel kernel; + REQUIRE( + fixture( + kernel)); +#endif + } +} diff --git a/thirdParty/cupla/alpaka/test/unit/warp/src/GetSize.cpp b/thirdParty/cupla/alpaka/test/unit/warp/src/GetSize.cpp new file mode 100644 index 0000000000..fc697e7182 --- /dev/null +++ b/thirdParty/cupla/alpaka/test/unit/warp/src/GetSize.cpp @@ -0,0 +1,58 @@ +/* Copyright 2020 Sergei Bastrakov + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include + +#include +#include +#include + +#include + +#include + +//############################################################################# +class GetSizeTestKernel +{ +public: + //----------------------------------------------------------------------------- + ALPAKA_NO_HOST_ACC_WARNING + template< + typename TAcc> + ALPAKA_FN_ACC auto operator()( + TAcc const & acc, + bool * success, + std::int32_t expectedWarpSize) const + -> void + { + std::int32_t const actualWarpSize = alpaka::warp::getSize(acc); + ALPAKA_CHECK(*success, actualWarpSize == expectedWarpSize); + } +}; + +//----------------------------------------------------------------------------- +TEMPLATE_LIST_TEST_CASE( "getSize", "[warp]", alpaka::test::acc::TestAccs) +{ + using Acc = TestType; + using Dev = alpaka::dev::Dev; + using Pltf = alpaka::pltf::Pltf; + using Dim = alpaka::dim::Dim; + using Idx = alpaka::idx::Idx; + + Dev const dev(alpaka::pltf::getDevByIdx(0u)); + auto const expectedWarpSize = static_cast(alpaka::dev::getWarpSize(dev)); + Idx const gridThreadExtentPerDim = 8; + alpaka::test::KernelExecutionFixture fixture( + alpaka::vec::Vec::all(gridThreadExtentPerDim)); + GetSizeTestKernel kernel; + REQUIRE( + fixture( + kernel, + expectedWarpSize)); +} diff --git 
a/thirdParty/cupla/alpaka/test/unit/workDiv/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/workDiv/CMakeLists.txt new file mode 100644 index 0000000000..766afb38d2 --- /dev/null +++ b/thirdParty/cupla/alpaka/test/unit/workDiv/CMakeLists.txt @@ -0,0 +1,24 @@ +# +# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan +# +# This file is part of alpaka. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# + +set(_TARGET_NAME "workDiv") + +append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE) + +alpaka_add_executable( + ${_TARGET_NAME} + ${_FILES_SOURCE}) +target_link_libraries( + ${_TARGET_NAME} + PRIVATE common) + +set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit") + +add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS}) diff --git a/thirdParty/cupla/alpaka/test/unit/workDiv/src/WorkDivHelpersTest.cpp b/thirdParty/cupla/alpaka/test/unit/workDiv/src/WorkDivHelpersTest.cpp new file mode 100644 index 0000000000..4764180db8 --- /dev/null +++ b/thirdParty/cupla/alpaka/test/unit/workDiv/src/WorkDivHelpersTest.cpp @@ -0,0 +1,68 @@ +/* Copyright 2020 Sergei Bastrakov + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#include +#include +#include + +#include +#include + +#include + + //----------------------------------------------------------------------------- +namespace +{ + template< typename TAcc > + auto getWorkDiv() + { + using Dev = alpaka::dev::Dev; + using Pltf = alpaka::pltf::Pltf; + using Dim = alpaka::dim::Dim; + using Idx = alpaka::idx::Idx; + + Dev const dev(alpaka::pltf::getDevByIdx(0u)); + auto const gridThreadExtent = alpaka::vec::Vec::all(10); + auto const threadElementExtent = alpaka::vec::Vec::ones(); + auto workDiv = alpaka::workdiv::getValidWorkDiv( + dev, + gridThreadExtent, + threadElementExtent, + false, + alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted); + return workDiv; + } +} + +//----------------------------------------------------------------------------- +TEMPLATE_LIST_TEST_CASE( "getValidWorkDiv", "[workDiv]", alpaka::test::acc::TestAccs) +{ + using Acc = TestType; + // Note: getValidWorkDiv() is called inside getWorkDiv + auto workDiv = getWorkDiv< Acc >(); + alpaka::ignore_unused( workDiv ); +} + +//----------------------------------------------------------------------------- +TEMPLATE_LIST_TEST_CASE( "isValidWorkDiv", "[workDiv]", alpaka::test::acc::TestAccs) +{ + using Acc = TestType; + using Dev = alpaka::dev::Dev; + using Pltf = alpaka::pltf::Pltf; + + Dev dev(alpaka::pltf::getDevByIdx(0u)); + auto workDiv = getWorkDiv< Acc >(); + // Test both overloads + REQUIRE( alpaka::workdiv::isValidWorkDiv( + alpaka::acc::getAccDevProps< Acc >( dev ), + workDiv)); + REQUIRE( alpaka::workdiv::isValidWorkDiv( + dev, + workDiv)); +} diff --git a/thirdParty/cupla/cuplaConfig.cmake b/thirdParty/cupla/cuplaConfig.cmake index e39a45b826..636f97cd5d 100644 --- a/thirdParty/cupla/cuplaConfig.cmake +++ b/thirdParty/cupla/cuplaConfig.cmake @@ -242,7 +242,7 @@ target_link_libraries( # Find cupla version. 
################################################################################ # Please also update the version in `include/cupla/version.hpp` -set(_cupla_VERSION "0.2.0") +set(_cupla_VERSION "0.3.0") ################################################################################ # Set return values. diff --git a/thirdParty/cupla/include/cupla/config/CpuOmp2Blocks.hpp b/thirdParty/cupla/include/cupla/config/CpuOmp2Blocks.hpp index bcca06f7e0..34881724b8 100644 --- a/thirdParty/cupla/include/cupla/config/CpuOmp2Blocks.hpp +++ b/thirdParty/cupla/include/cupla/config/CpuOmp2Blocks.hpp @@ -40,4 +40,4 @@ # include "cupla/../../src/stream.cpp" #endif -#include "cuda_to_cupla.hpp" +#include "cupla.hpp" diff --git a/thirdParty/cupla/include/cupla/config/CpuOmp2Threads.hpp b/thirdParty/cupla/include/cupla/config/CpuOmp2Threads.hpp index 5e5ee8f73d..287bef45cb 100644 --- a/thirdParty/cupla/include/cupla/config/CpuOmp2Threads.hpp +++ b/thirdParty/cupla/include/cupla/config/CpuOmp2Threads.hpp @@ -40,4 +40,4 @@ # include "cupla/../../src/stream.cpp" #endif -#include "cuda_to_cupla.hpp" +#include "cupla.hpp" diff --git a/thirdParty/cupla/include/cupla/config/CpuOmp4.hpp b/thirdParty/cupla/include/cupla/config/CpuOmp4.hpp index 73d3312ea7..b3e14d3ef3 100644 --- a/thirdParty/cupla/include/cupla/config/CpuOmp4.hpp +++ b/thirdParty/cupla/include/cupla/config/CpuOmp4.hpp @@ -40,4 +40,4 @@ # include "cupla/../../src/stream.cpp" #endif -#include "cuda_to_cupla.hpp" +#include "cupla.hpp" diff --git a/thirdParty/cupla/include/cupla/config/CpuSerial.hpp b/thirdParty/cupla/include/cupla/config/CpuSerial.hpp index 4823147d50..a88f3541ad 100644 --- a/thirdParty/cupla/include/cupla/config/CpuSerial.hpp +++ b/thirdParty/cupla/include/cupla/config/CpuSerial.hpp @@ -40,4 +40,4 @@ # include "cupla/../../src/stream.cpp" #endif -#include "cuda_to_cupla.hpp" +#include "cupla.hpp" diff --git a/thirdParty/cupla/include/cupla/config/CpuTbbBlocks.hpp 
b/thirdParty/cupla/include/cupla/config/CpuTbbBlocks.hpp index 03c3c25201..643c23d53c 100644 --- a/thirdParty/cupla/include/cupla/config/CpuTbbBlocks.hpp +++ b/thirdParty/cupla/include/cupla/config/CpuTbbBlocks.hpp @@ -40,4 +40,4 @@ # include "cupla/../../src/stream.cpp" #endif -#include "cuda_to_cupla.hpp" +#include "cupla.hpp" diff --git a/thirdParty/cupla/include/cupla/config/CpuThreads.hpp b/thirdParty/cupla/include/cupla/config/CpuThreads.hpp index d1fbe9461e..036c963f8c 100644 --- a/thirdParty/cupla/include/cupla/config/CpuThreads.hpp +++ b/thirdParty/cupla/include/cupla/config/CpuThreads.hpp @@ -40,4 +40,4 @@ # include "cupla/../../src/stream.cpp" #endif -#include "cuda_to_cupla.hpp" +#include "cupla.hpp" diff --git a/thirdParty/cupla/include/cupla/config/GpuCudaRt.hpp b/thirdParty/cupla/include/cupla/config/GpuCudaRt.hpp index db9829ed6a..e6d52ad480 100644 --- a/thirdParty/cupla/include/cupla/config/GpuCudaRt.hpp +++ b/thirdParty/cupla/include/cupla/config/GpuCudaRt.hpp @@ -40,4 +40,4 @@ # include "cupla/../../src/stream.cpp" #endif -#include "cuda_to_cupla.hpp" +#include "cupla.hpp" diff --git a/thirdParty/cupla/include/cupla/config/GpuHipRt.hpp b/thirdParty/cupla/include/cupla/config/GpuHipRt.hpp index 302128029f..6195cdf134 100644 --- a/thirdParty/cupla/include/cupla/config/GpuHipRt.hpp +++ b/thirdParty/cupla/include/cupla/config/GpuHipRt.hpp @@ -40,4 +40,4 @@ # include "cupla/../../src/stream.cpp" #endif -#include "cuda_to_cupla.hpp" +#include "cupla.hpp" diff --git a/thirdParty/cupla/include/cupla/manager/Stream.hpp b/thirdParty/cupla/include/cupla/manager/Stream.hpp index 96e9090900..43fc4d3324 100644 --- a/thirdParty/cupla/include/cupla/manager/Stream.hpp +++ b/thirdParty/cupla/include/cupla/manager/Stream.hpp @@ -69,23 +69,7 @@ namespace manager create( ) -> cuplaStream_t { - - auto& device = Device< DeviceType >::get(); - - std::unique_ptr< - QueueType - > streamPtr( - new QueueType( - device.current() - ) - ); - cuplaStream_t streamId = 
reinterpret_cast< cuplaStream_t >( - m_id++ - ); - m_mapVector[ device.id() ].insert( - std::make_pair( streamId, std::move( streamPtr ) ) - ); - return streamId; + return createNewStream(reinterpret_cast< cuplaStream_t >(m_id++)); } auto @@ -102,7 +86,7 @@ namespace manager { if( streamId == 0 ) { - this->create( ); + createNewStream( streamId ); return this->stream( streamId ); } else @@ -153,8 +137,6 @@ namespace manager const auto deviceId = device.id(); m_mapVector[ deviceId ].clear( ); - // reset id to allow that this instance can be reused - m_id = 0u; // @todo: check if clear creates errors return true; @@ -165,8 +147,26 @@ namespace manager { } - //! unique if for the next stream - size_t m_id = 0u; + auto + createNewStream( cuplaStream_t streamId ) + -> cuplaStream_t + { + + auto& device = Device< DeviceType >::get(); + + auto streamPtr = std::make_unique< QueueType >( device.current() ); + m_mapVector[ device.id() ].insert( + std::make_pair( streamId, std::move( streamPtr ) ) + ); + return streamId; + } + + /** unique id for the next stream + * + * The enumeration starts with id one. Id zero is reserved + * for the default stream. 
+ */ + size_t m_id = 1u; }; diff --git a/thirdParty/cupla/include/cupla/version.hpp b/thirdParty/cupla/include/cupla/version.hpp index 187ee9d8e9..b337ddf5c6 100644 --- a/thirdParty/cupla/include/cupla/version.hpp +++ b/thirdParty/cupla/include/cupla/version.hpp @@ -22,6 +22,6 @@ // Please also update the version in `cuplaConfig.cmake` #define CUPLA_VERSION_MAJOR 0 -#define CUPLA_VERSION_MINOR 2 +#define CUPLA_VERSION_MINOR 3 #define CUPLA_VERSION_PATCH 0 -#define CUPLA_VERSION_LABEL "" +#define CUPLA_VERSION_LABEL "dev" diff --git a/thirdParty/cupla/script/compiler_base.yml b/thirdParty/cupla/script/compiler_base.yml new file mode 100644 index 0000000000..3048a5073b --- /dev/null +++ b/thirdParty/cupla/script/compiler_base.yml @@ -0,0 +1,55 @@ +.base_gcc: + image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci:gcc + variables: + GIT_SUBMODULE_STRATEGY: normal + ALPAKA_ACCS: "ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE + ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE + ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE" + # ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE + script: + - source script/run_test.sh + # x86_64 tag is used to get a multi-core CPU for the tests + tags: + - x86_64 + +.base_clang: + image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci:clang + variables: + GIT_SUBMODULE_STRATEGY: normal + ALPAKA_ACCS: "ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE + ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE" + # -DALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=ON + # -DALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE=ON + script: + - source script/run_test.sh + # x86_64 tag is used to get a multi-core CPU for the tests + tags: + - x86_64 + +.base_cuda: + variables: + GIT_SUBMODULE_STRATEGY: normal + CUPLA_CXX: g++ + ALPAKA_ACCS: "ALPAKA_ACC_GPU_CUDA_ENABLE" + before_script: + - nvidia-smi + - nvcc --version + script: + - source script/run_test.sh + tags: + - cuda + - intel + +.base_cuda_clang: + variables: + GIT_SUBMODULE_STRATEGY: normal + ALPAKA_ACCS: "ALPAKA_ACC_GPU_CUDA_ENABLE" + CUPLA_CMAKE_ARGS: 
"-DALPAKA_CUDA_COMPILER=clang" + before_script: + - nvidia-smi + - nvcc --version + script: + - source script/run_test.sh + tags: + - cuda + - intel diff --git a/thirdParty/cupla/script/run_test.sh b/thirdParty/cupla/script/run_test.sh new file mode 100755 index 0000000000..55457abfd8 --- /dev/null +++ b/thirdParty/cupla/script/run_test.sh @@ -0,0 +1,91 @@ +#!/bin/bash + +# the default build type is Release +# if neccesary, you can rerun the pipeline with another build type-> https://docs.gitlab.com/ee/ci/pipelines.html#manually-executing-pipelines +# to change the build type, you must set the environment variable CUPLA_BUILD_TYPE + +if [[ ! -v CUPLA_BUILD_TYPE ]] ; then + CUPLA_BUILD_TYPE=Release ; +fi + +################################################### +# cmake config builder +################################################### + +CUPLA_CONST_ARGS="" +CUPLA_CONST_ARGS="${CUPLA_CONST_ARGS} -DCMAKE_BUILD_TYPE=${CUPLA_BUILD_TYPE}" +CUPLA_CONST_ARGS="${CUPLA_CONST_ARGS} ${CUPLA_CMAKE_ARGS}" + +CMAKE_CONFIGS=() +for CXX_VERSION in $CUPLA_CXX; do + for BOOST_VERSION in ${CUPLA_BOOST_VERSIONS}; do + for ACC in ${ALPAKA_ACCS}; do + CMAKE_CONFIGS+=("${CUPLA_CONST_ARGS} -DCMAKE_CXX_COMPILER=${CXX_VERSION} -DBOOST_ROOT=/opt/boost/${BOOST_VERSION} -D${ACC}=ON") + done + done +done + +################################################### +# build an run tests +################################################### + +# use one build directory for all build configurations +mkdir build +cd build + +export cupla_DIR=$CI_PROJECT_DIR + +# ALPAKA_ACCS contains the backends, which are used for each build +# the backends are set in the sepcialized base jobs .base_gcc,.base_clang and.base_cuda +for CONFIG in $(seq 0 $((${#CMAKE_CONFIGS[*]} - 1))); do + CMAKE_ARGS=${CMAKE_CONFIGS[$CONFIG]} + echo -e "\033[0;32m///////////////////////////////////////////////////" + echo "number of processor threads -> $(nproc)" + cmake --version | head -n 1 + echo "CMAKE_ARGS -> ${CMAKE_ARGS}" + echo -e 
"/////////////////////////////////////////////////// \033[0m \n\n" + + echo "###################################################" + echo "# Example Matrix Multiplication (adapted original)" + echo "###################################################" + echo "can not run with CPU_B_SEQ_T_SEQ due to missing elements layer in original SDK example" + echo "CPU_B_SEQ_T_OMP2/THREADS too many threads necessary (256)" + if [[ $CMAKE_ARGS =~ -*DALPAKA_ACC_GPU_CUDA_ENABLE=ON.* ]]; then + cmake $cupla_DIR/example/CUDASamples/matrixMul/ \ + $CMAKE_ARGS + make -j + time ./matrixMul -wA=64 -wB=64 -hA=64 -hB=64 + rm -r * ; + fi + + echo "###################################################" + echo "# Example Async API (adapted original)" + echo "###################################################" + echo "can not run with CPU_B_SEQ_T_SEQ due to missing elements layer in original SDK example" + echo "CPU_B_SEQ_T_OMP2/THREADS too many threads necessary (512)" + if [[ $CMAKE_ARGS =~ -*DALPAKA_ACC_GPU_CUDA_ENABLE=ON.* ]]; then + cmake $cupla_DIR/example/CUDASamples/asyncAPI/ \ + $CMAKE_ARGS + make -j + time ./asyncAPI + rm -r * ; + fi + + echo "###################################################" + echo "# Example Async API (added elements layer)" + echo "###################################################" + cmake $cupla_DIR/example/CUDASamples/asyncAPI_tuned/ \ + $CMAKE_ARGS + make -j + time ./asyncAPI_tuned + rm -r * + + echo "###################################################" + echo "Example vectorAdd (added elements layer)" + echo "###################################################" + cmake $cupla_DIR/example/CUDASamples/vectorAdd/ \ + $CMAKE_ARGS + make -j + time ./vectorAdd 100000 + rm -r * ; +done diff --git a/thirdParty/cupla/script/run_test.yml b/thirdParty/cupla/script/run_test.yml new file mode 100644 index 0000000000..223a241f79 --- /dev/null +++ b/thirdParty/cupla/script/run_test.yml @@ -0,0 +1,57 @@ +.test_job: + script: + # the default build type is Release + # if 
neccesary, you can rerun the pipeline with another build type-> https://docs.gitlab.com/ee/ci/pipelines.html#manually-executing-pipelines + # to change the build type, you must set the environment variable CUPLA_BUILD_TYPE + - if [[ ! -v CUPLA_BUILD_TYPE ]] ; then + CUPLA_BUILD_TYPE=Release ; + fi + - echo "number of processor threads $(nproc)" + - $CXX --version + - cmake --version + - echo "Boost version-> $BOOST_VERSION" + - export cupla_DIR=$CI_PROJECT_DIR + # use one build directory for all build configurations + - mkdir build + - cd build + - echo "Build type-> $CUPLA_BUILD_TYPE" + # ALPAKA_ACCS contains the backends, which are used for each build + # the backends are set in the sepcialized base jobs .base_gcc,.base_clang and.base_cuda + - for CMAKE_FLAGS in $ALPAKA_ACCS ; do + echo "###################################################" + && echo "# Example Matrix Multiplication (adapted original)" + && echo "###################################################" + && echo "can not run with CPU_B_SEQ_T_SEQ due to missing elements layer in original SDK example" + && echo "CPU_B_SEQ_T_OMP2/THREADS too many threads necessary (256)" + && if [[ $CMAKE_FLAGS =~ -*DALPAKA_ACC_GPU_CUDA_ENABLE=ON.* ]]; then + cmake $cupla_DIR/example/CUDASamples/matrixMul/ -DBOOST_ROOT=/opt/boost/$BOOST_VERSION $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=$CUPLA_BUILD_TYPE + && make -j + && time ./matrixMul -wA=64 -wB=64 -hA=64 -hB=64 + && rm -r * ; + fi + && echo "###################################################" + && echo "# Example Async API (adapted original)" + && echo "###################################################" + && echo "can not run with CPU_B_SEQ_T_SEQ due to missing elements layer in original SDK example" + && echo "CPU_B_SEQ_T_OMP2/THREADS too many threads necessary (512)" + && if [[ $CMAKE_FLAGS =~ -*DALPAKA_ACC_GPU_CUDA_ENABLE=ON.* ]]; then + cmake $cupla_DIR/example/CUDASamples/asyncAPI/ -DBOOST_ROOT=/opt/boost/$BOOST_VERSION $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=$CUPLA_BUILD_TYPE 
+ && make -j + && time ./asyncAPI + && rm -r * ; + fi + && echo "###################################################" + && echo "# Example Async API (added elements layer)" + && echo "###################################################" + && cmake $cupla_DIR/example/CUDASamples/asyncAPI_tuned/ -DBOOST_ROOT=/opt/boost/$BOOST_VERSION $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=$CUPLA_BUILD_TYPE + && make -j + && time ./asyncAPI_tuned + && rm -r * + && echo "###################################################" + && echo "Example vectorAdd (added elements layer)" + && echo "###################################################" + && cmake $cupla_DIR/example/CUDASamples/vectorAdd/ -DBOOST_ROOT=/opt/boost/$BOOST_VERSION $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=$CUPLA_BUILD_TYPE + && make -j + && time ./vectorAdd 100000 + && rm -r * ; + done diff --git a/thirdParty/cupla/src/event.cpp b/thirdParty/cupla/src/event.cpp index e4abb1c811..3c6a7c9f27 100644 --- a/thirdParty/cupla/src/event.cpp +++ b/thirdParty/cupla/src/event.cpp @@ -43,7 +43,7 @@ cuplaEventCreateWithFlags( >::get().create( flags ); return cuplaSuccess; -}; +} CUPLA_HEADER_ONLY_FUNC_SPEC @@ -58,7 +58,7 @@ cuplaEventCreate( >::get().create( 0 ); return cuplaSuccess; -}; +} CUPLA_HEADER_ONLY_FUNC_SPEC cuplaError_t @@ -73,7 +73,7 @@ cuplaEventDestroy( cuplaEvent_t event ) return cuplaSuccess; else return cuplaErrorInitializationError; -}; +} CUPLA_HEADER_ONLY_FUNC_SPEC cuplaError_t diff --git a/thirdParty/cupla/src/memory.cpp b/thirdParty/cupla/src/memory.cpp index 07486beeae..c1d397b057 100644 --- a/thirdParty/cupla/src/memory.cpp +++ b/thirdParty/cupla/src/memory.cpp @@ -77,7 +77,7 @@ cuplaMallocPitch( *pitch = ::alpaka::mem::view::getPitchBytes< 1u >( buf ); return cuplaSuccess; -}; +} CUPLA_HEADER_ONLY_FUNC_SPEC cuplaError_t diff --git a/thirdParty/cupla/src/stream.cpp b/thirdParty/cupla/src/stream.cpp index ba8f579d5a..f89bdb65f3 100644 --- a/thirdParty/cupla/src/stream.cpp +++ b/thirdParty/cupla/src/stream.cpp @@ -43,7 +43,7 @@ 
cuplaStreamCreate( >::get().create(); return cuplaSuccess; -}; +} CUPLA_HEADER_ONLY_FUNC_SPEC cuplaError_t @@ -58,7 +58,7 @@ cuplaStreamDestroy( cuplaStream_t stream ) return cuplaSuccess; else return cuplaErrorInitializationError; -}; +} CUPLA_HEADER_ONLY_FUNC_SPEC cuplaError_t @@ -109,6 +109,6 @@ cuplaStreamQuery( cuplaStream_t stream ) return cuplaSuccess; else return cuplaErrorNotReady; -}; +} } //namespace CUPLA_ACCELERATOR_NAMESPACE diff --git a/thirdParty/cupla/test/system/config/kernel.cpp b/thirdParty/cupla/test/system/config/kernel.cpp index 31b1e3e971..2768e0aabc 100644 --- a/thirdParty/cupla/test/system/config/kernel.cpp +++ b/thirdParty/cupla/test/system/config/kernel.cpp @@ -35,6 +35,8 @@ # include #endif +#include "cuda_to_cupla.hpp" + struct IncrementKernel { template diff --git a/thirdParty/cupla/test/system/config/main.cpp b/thirdParty/cupla/test/system/config/main.cpp index fd93bba6d3..e7d0903941 100644 --- a/thirdParty/cupla/test/system/config/main.cpp +++ b/thirdParty/cupla/test/system/config/main.cpp @@ -35,6 +35,8 @@ # include #endif +#include "cuda_to_cupla.hpp" + extern void callIncrementKernel(int* pr_d); int main()