diff --git a/README.md b/README.md new file mode 100644 index 0000000..e43e72f --- /dev/null +++ b/README.md @@ -0,0 +1,29 @@ +# Modern C++ for Computational Scientists + +Repository view: + +Pages view: + +Since the 2011 revision to the C++ language and standard library, the +ways it is now being used are quite different. Used well, these +features enable the programmer to write elegant, reusable and portable +code that runs efficiently on a variety of architectures. + +However it is still a very large and complex tool. This set of +lectures and practical exercises, will cover a minimal set of features +to allow an experienced non-C++ programmer to get to grips with +language. These include: +* defining your own types +* overloading +* templates +* containers +* iterators +* lambdas +* standard algorithms +* threading + +It concludes with a brief discussion of modern frameworks for portable +parallel performance which are commonly implemented in C++. + +* [Lectures](lectures/) +* [Practical exercises](exercises/) diff --git a/exercises/README.md b/exercises/README.md new file mode 100644 index 0000000..3961f8f --- /dev/null +++ b/exercises/README.md @@ -0,0 +1,13 @@ +# C++ exercise + +Practical exercises for C++ course + +See each subdirectory for further instructions + +* [Complex numbers](complex/) +* [Containers](containers/) +* [Morton-order matrix class template](morton-order/) +* [Using algorithms](algorithm/) +* [Eigen](eigen/) +* [Simple use of threads](threads/) +* [Portable performance with Kokkos](kokkos/) diff --git a/exercises/algorithm/Makefile b/exercises/algorithm/Makefile new file mode 100644 index 0000000..89717a5 --- /dev/null +++ b/exercises/algorithm/Makefile @@ -0,0 +1,4 @@ +CXXFLAGS = --std=c++17 +CC = $(CXX) + +ex : ex.o diff --git a/exercises/algorithm/README.md b/exercises/algorithm/README.md new file mode 100644 index 0000000..278cb79 --- /dev/null +++ b/exercises/algorithm/README.md @@ -0,0 +1,12 @@ +# Algorithm use exercise + +In the file `ex.cpp` there is an incomplete program which, by +following the instructions in the comments, you can finish. + +You will likely want to refer to the documentation of the standard +library algorithms, which, for reasons, are split across two headers: + +- + +- + diff --git a/exercises/algorithm/ex.cpp b/exercises/algorithm/ex.cpp new file mode 100644 index 0000000..02848ee --- /dev/null +++ b/exercises/algorithm/ex.cpp @@ -0,0 +1,91 @@ +#include +#include +#include +#include +#include +#include + +void print_vec(std::vector &vec) { + for (auto &el : vec) { + std::cout << el << " "; + } + std::cout << std::endl << std::endl; +} + +int main(int argc, char* argv[]) { + // First, a warmup of basic algorithm usage + auto nums = std::vector(50); + + // let's initalize our vector with some random numbers + for (int i = 0; i < 50; ++i) { + nums[i] = std::rand() % 100; + } + + // Bonus: can we do this using the algorithms library? Hint - std::generate + // and use the following lambda + auto gen = []() { return std::rand() % 100; }; + + // Your code here.... + + // Now, sort nums. + + // Your code here.... + + std::cout << "Sorted nums: "; + print_vec(nums); + // Reverse sort nums, using (a) sort on its own and (b) using sort and another + // algorithm function + + // Your code here.... + + std::cout << "Reverse sorted nums (a): "; + print_vec(nums); + + // Your code here.... + + std::cout << "Reverse sorted nums (b): "; + print_vec(nums); + + // Now, lets look at a more involved example. We'll be working through Project + // Euler No.2 (https://projecteuler.net/problem=2) "By considering the terms in + // the Fibonacci sequence whose values do not exceed four million, find the sum + // of the even-valued terms" + + // First lets get the fist 47 fibonacci numbers + // BONUS: use std::transform + + auto fibs = std::vector(47); + + // Your code here.... + + + print_vec(fibs); + + // Next, get all that are less than or equal to 4 million, and store them in + // fibs_less HINT: use std::copy_if and std::back_inserter + + auto fibs_less = std::vector(); + + // Your code here.... + + std::cout << "fibs <= 4000000: "; + print_vec(fibs_less); + + // Now, get the evens. Use the same approach as above + auto evens = std::vector(); + + // Your code here.... + + + std::cout << "Evens: "; + print_vec(evens); + + // Finally, let's sum them (hint: std::accumulate) + + int sum = 0; + + std::cout << "Sum of even fibonacci numbers not greater than 4 million: " + << sum << std::endl; + + return 0; +} diff --git a/exercises/complex/Makefile b/exercises/complex/Makefile new file mode 100644 index 0000000..da851f6 --- /dev/null +++ b/exercises/complex/Makefile @@ -0,0 +1,15 @@ +CXXFLAGS = --std=c++17 -I../include + +test : complex.o test.o + $(CXX) $^ -o $@ + +test.o : catch.hpp + +catch.hpp : + wget https://github.com/catchorg/Catch2/releases/download/v2.13.6/catch.hpp + +run : test + ./test + +clean : + rm -rf *.o test diff --git a/exercises/complex/README.md b/exercises/complex/README.md new file mode 100644 index 0000000..f8a502d --- /dev/null +++ b/exercises/complex/README.md @@ -0,0 +1,3 @@ +This document is available in multiple formats: +* [PDF](instructions.pdf) +* [Markdown](instructions.md) diff --git a/exercises/complex/complex.cpp b/exercises/complex/complex.cpp new file mode 100644 index 0000000..a27600a --- /dev/null +++ b/exercises/complex/complex.cpp @@ -0,0 +1,44 @@ +#include "complex.hpp" +#include + + + +double const& Complex::real() { + return re; +} + +double Complex::imag() const { + return im; +} + +Complex Complex::conj() const { + return Complex{re, -im}; +} + +double Complex::norm() const { + return std::sqrt(norm2()); +} + +double Complex::norm2() const { + return re*re + im*im; +} + +bool operator==(Complex const& a, Complex const& b) { + return (a.re == b.re) && (a.im == b.re); +} +bool operator!=(Complex const& a, Complex const& b) { + return !(a == b); +} + +Complex operator+(Complex const& a, Complex const& b) { + return Complex{a.re + b.re, a.im + b.im}; +} + + +Complex operator*(Complex const& a, Complex const& b) { + // (a + ib)*(c + id) == (a*c - b*d) + i(b*c + a*d) +} + +Complex operator-(Complex const& a) { + return Complex{-a.re, -a.im}; +} diff --git a/exercises/complex/complex.hpp b/exercises/complex/complex.hpp new file mode 100644 index 0000000..c623b28 --- /dev/null +++ b/exercises/complex/complex.hpp @@ -0,0 +1,43 @@ +#ifndef CPPEX_COMPLEX_COMPLEX_HPP +#define CPPEX_COMPLEX_COMPLEX_HPP + +// Simple complex number class +class Complex { +public: + // Default value is zero + Complex() = default; + // Construct purely real complex + // Construct from real and imaginary parts + + // Access components + double real() const; + double imag() const; + + // Compute the complex conjugate + Complex conj(); + + // Compute the magnitude and squared magnitude + double norm() const; + double norm2() const; + + // Declare comparisons + friend bool operator==(Complex const& a, Complex const& b); + friend bool operator!=(Complex const& a, Complex const& b); + + // Declare binary arithmetic operators + friend Complex operator+(Complex const& a, Complex const& b); + friend Complex operator-(Complex const& a, Complex const& b); + friend Complex operator*(Complex const& a, Complex const& b); + friend Complex operator/(Complex const& a, Complex const& b); + // Question: how would you declare multiplication and division by a real number? + + // Unary negation + friend Complex operator-(Complex const& a); + +private: + double re = 0.0; + double im = 0.0; +}; + + +#endif diff --git a/exercises/complex/instructions.md b/exercises/complex/instructions.md new file mode 100644 index 0000000..b56090b --- /dev/null +++ b/exercises/complex/instructions.md @@ -0,0 +1,32 @@ +# Introductory C++ exercises +## Rupert Nash +## r.nash@epcc.ed.ac.uk + +The files for this are on [Github](https://github.com/EPCCed/APT-CPP). + +To check out the repository run: + +```bash +git clone https://github.com/EPCCed/APT-CPP +cd APT-CPP/exercises/complex +``` + +## Task + +You have been provided with a simple, *partial* implementation of a complex number class and an associated test program. Your task is to alter the complex.cpp/complex.hpp files until the tests compile and run correctly! (Please don't edit the test.cpp program!). To compile, you need a C++17 capable compiler - on Cirrus the default `gcc` module is fine - then just use `make`. + +This initial program fails: + +```bash +$ module load gcc +$ make +c++ --std=c++17 -I../include -c -o complex.o complex.cpp +complex.cpp:6:24: error: out-of-line definition of 'real' does not match any declaration in 'Complex' +double const& Complex::real() { + ^~~~ +./complex.hpp:13:10: note: member declaration does not match because it is const qualified + double real() const; + +``` + +You need to start tracking down the bugs and fixing them. diff --git a/exercises/complex/instructions.pdf b/exercises/complex/instructions.pdf new file mode 100644 index 0000000..9b2ee9c Binary files /dev/null and b/exercises/complex/instructions.pdf differ diff --git a/exercises/complex/test.cpp b/exercises/complex/test.cpp new file mode 100644 index 0000000..60d8077 --- /dev/null +++ b/exercises/complex/test.cpp @@ -0,0 +1,95 @@ +// Catch2 is a unit testing library +// Here we let it create a main() function for us +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "complex.hpp" + +TEST_CASE("Complex numbers are constructed real/imag parts readable") { + const Complex zero; + REQUIRE(zero.real() == 0.0); + REQUIRE(zero.imag() == 0.0); + + const Complex one{1.0}; + REQUIRE(one.real() == 1.0); + REQUIRE(one.imag() == 0.0); + + const Complex i{0, 1}; + REQUIRE(i.real() == 0.0); + REQUIRE(i.imag() == 1.0); + + const Complex z1{1, -83}; + const Complex z2 = z1; + REQUIRE(i.real() == 0.0); + REQUIRE(i.imag() == 1.0); +} + +TEST_CASE("Complex numbers can be compared") { + const Complex zero; + const Complex one{1.0}; + const Complex i{0, 1}; + REQUIRE(zero == zero); + REQUIRE(one == one); + REQUIRE(i == i); + REQUIRE(zero != one); + REQUIRE(one != i); +} + +TEST_CASE("Complex numbers can have magnitude computed") { + REQUIRE(Complex{}.norm2() == 0.0); + REQUIRE(Complex{3,4}.norm2() == 25.0); +} + +// Pure real => z == z* +void CheckConjReal(double x) { + Complex z{x}; + REQUIRE(z == z.conj()); +} +// Pure imaginary => z* == -z +void CheckConjImag(double y) { + Complex z{0.0, y}; + + REQUIRE(z == -z.conj()); +} + +TEST_CASE("Complex numbers be conjugated") { + CheckConjReal(0); + CheckConjReal(1); + CheckConjReal(-3.14); + CheckConjReal(1.876e6); + + CheckConjImag(0); + CheckConjImag(1); + CheckConjImag(-3.14); + CheckConjImag(1.876e6); +} + +void CheckZplusZeq2Z(const Complex& z) { + REQUIRE(z + z == Complex{2*z.real(), 2*z.imag()}); +} +void CheckZminusZeq0(const Complex& z) { + REQUIRE(z - z == Complex{}); +} + +TEST_CASE("Complex number can be added and subtracted") { + CheckZplusZeq2Z(1); + CheckZplusZeq2Z(0); + CheckZplusZeq2Z(-1); + + CheckZminusZeq0(1); + CheckZminusZeq0(0); + CheckZminusZeq0(-1); + CheckZminusZeq0(Complex{1,2}); + CheckZminusZeq0(Complex{-42, 1e-3}); +} + +TEST_CASE("Complex numbers can be multiplied") { + const Complex i{0, 1}; + Complex z{1}; + z = z*i; + REQUIRE(z == i); + z = z*i; + REQUIRE(z == Complex{-1}); + z = z*i; + REQUIRE(z == -i); +} diff --git a/exercises/containers/Makefile b/exercises/containers/Makefile new file mode 100644 index 0000000..e1cfa1b --- /dev/null +++ b/exercises/containers/Makefile @@ -0,0 +1,10 @@ +CXXFLAGS = --std=c++17 -I../include + +test : vector_ex.o map_ex.o test.o + $(CXX) $^ -o $@ + +run : test + ./test + +clean : + rm -rf *.o test diff --git a/exercises/containers/README.md b/exercises/containers/README.md new file mode 100644 index 0000000..f8a502d --- /dev/null +++ b/exercises/containers/README.md @@ -0,0 +1,3 @@ +This document is available in multiple formats: +* [PDF](instructions.pdf) +* [Markdown](instructions.md) diff --git a/exercises/containers/instructions.md b/exercises/containers/instructions.md new file mode 100644 index 0000000..8f60e0a --- /dev/null +++ b/exercises/containers/instructions.md @@ -0,0 +1,22 @@ +# Containers exercise + +In your clone of this repository, find the `containers` exercise and list +the files + +``` +$ cd APT-CPP/exercises/containers +$ ls +Makefile test.cpp vector_ex.cpp vector_ex.hpp +``` + +As before, `test.cpp` holds some basic unit tests. + +`vector_ex.cpp`/`.hpp` hold some functions that work on `std::vector` - provide the implementations + +The tests require that you implement, in a new header/implementation +pair of files, a function to add a string to a `std::map` as the key, +the value being the length of the string. + +You will want to find the documentatation for `map` on https://en.cppreference.com/ + +You can compile with `make` as before. diff --git a/exercises/containers/instructions.pdf b/exercises/containers/instructions.pdf new file mode 100644 index 0000000..3865e71 Binary files /dev/null and b/exercises/containers/instructions.pdf differ diff --git a/exercises/containers/test.cpp b/exercises/containers/test.cpp new file mode 100644 index 0000000..71d414b --- /dev/null +++ b/exercises/containers/test.cpp @@ -0,0 +1,77 @@ +// Catch2 is a unit testing library +// Here we let it create a main() function for us +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "vector_ex.hpp" +#include "map_ex.hpp" +#include + +// type alias +using IV = std::vector; + +IV IntRange(int n) { + auto ans = IV(n); + for (int i = 0; i < n; ++i) + ans[i] = i; + return ans; +} + +TEST_CASE("Vector Even") { + IV const empty; + REQUIRE(empty == GetEven(empty)); + + auto const zero = IntRange(1); + // Test the testing code! + REQUIRE(zero.size() == 1); + REQUIRE(zero[0] == 0); + + REQUIRE(zero == GetEven(zero)); + + auto const zero_one = IntRange(2); + // Test the testing code! + REQUIRE(zero_one.size() == 2); + REQUIRE(zero_one[0] == 0); + REQUIRE(zero_one[1] == 1); + + REQUIRE(zero == GetEven(zero_one)); + + // edited toddler random numbers + auto const random = IV{8, 127356, 1, 29, 4376, 263478, 1537}; + auto const random_even = IV{ 8, 127356, 4376, 263478}; + REQUIRE(GetEven(random) == random_even); +} + +std::string STR(IV const& v) { + std::stringstream ss; + PrintVectorOfInt(ss, v); + return ss.str(); +} + +TEST_CASE("Vector Print") { + REQUIRE(STR(IV{}) == "[ ]"); + REQUIRE(STR(IntRange(1)) == "[ 0]"); + REQUIRE(STR(IntRange(2)) == "[ 0, 1]"); + // edited toddler random numbers + REQUIRE(STR(IV{8, 127356, 1, 29, 4376, 263478, 1537}) == "[ 8, 127356, 1, 29, 4376, 263478, 1537]"); +} + +using Word2Len = std::map; + +TEST_CASE("Map adding") { + Word2Len wlens; + + SECTION( "Adding a word works") { + bool did_insert = AddWord(wlens, "test"); + REQUIRE(did_insert); + REQUIRE(wlens.size() == 1); + REQUIRE(wlens.find("test") != wlens.end()); + + // Second time must return false + bool did_insert_second_time = AddWord(wlens, "test"); + REQUIRE(!did_insert_second_time); + REQUIRE(wlens.size() == 1); + REQUIRE(wlens.find("test") != wlens.end()); + } + +} diff --git a/exercises/containers/vector_ex.cpp b/exercises/containers/vector_ex.cpp new file mode 100644 index 0000000..00c2828 --- /dev/null +++ b/exercises/containers/vector_ex.cpp @@ -0,0 +1,8 @@ +#include "vector_ex.hpp" + +// std::vector documentation: +// https://en.cppreference.com/w/cpp/container/vector + +std::vector GetEven(std::vector const& source) { +} + diff --git a/exercises/containers/vector_ex.hpp b/exercises/containers/vector_ex.hpp new file mode 100644 index 0000000..22372c4 --- /dev/null +++ b/exercises/containers/vector_ex.hpp @@ -0,0 +1,18 @@ +#ifndef CPPEX_VECTOR_EX_HPP +#define CPPEX_VECTOR_EX_HPP + +#include +#include + +// Here we declare two functions. +// Provide implementations, in vector_ex.cpp + +// Given a vector of integers, return a new vector with only the even +// elements from our input +std::vector GetEven(std::vector const& source); + +// Given a vector of ints, print the data to the stream +// [0, 1] +void PrintVectorOfInt(std::ostream& output, std::vector const& data); + +#endif diff --git a/exercises/eigen/Makefile b/exercises/eigen/Makefile new file mode 100644 index 0000000..10bc750 --- /dev/null +++ b/exercises/eigen/Makefile @@ -0,0 +1,15 @@ +CXX=clang++ +CXX=icpc +CXX=g++ +CPPFLAGS=-O2 -std=c++11 + +all: explicit implicit sparse + +explicit: explicit.o + $(CXX) $(CPPFLAGS) -o $@ $< + +implicit: implicit.o + $(CXX) $(CPPFLAGS) -o $@ $< + +sparse: sparse.o + $(CXX) $(CPPFLAGS) -o $@ $< diff --git a/exercises/eigen/README.md b/exercises/eigen/README.md new file mode 100644 index 0000000..2159542 --- /dev/null +++ b/exercises/eigen/README.md @@ -0,0 +1,6 @@ +# Eigen Demos + +Some simple demos using the Eigen C++ library to solve 1D diffusion +problems. + +See Eigen lecture for details diff --git a/exercises/eigen/explicit.cpp b/exercises/eigen/explicit.cpp new file mode 100644 index 0000000..ba27e73 --- /dev/null +++ b/exercises/eigen/explicit.cpp @@ -0,0 +1,45 @@ +#include + +#include +#include +#include + +int main() +{ + int n = 12; + + std::vector Avec(n * n); + + Eigen::Map A(Avec.data(), n, n); + A = Eigen::MatrixXd::Identity(n, n); + + double delta = 0.4; + + for (int i = 0; i < n - 1; ++i) + { + A(i + 1, i) += delta; + A(i + 1, i + 1) += -delta; + + A(i, i) += -delta; + A(i, i + 1) += +delta; + } + + std::cout << A << std::endl << std::endl; + + Eigen::VectorXd b(n); + b.setZero(); + b.head(n / 2).array() = 1.0; + + std::ofstream f; + f.open("a.txt"); + for (int i = 0; i < 20; ++i) + { + std::cout << b.transpose() << std::endl; + f << b << std::endl << std::endl << std::endl; + b = A * b; + } + + std::cout << b.transpose() << std::endl; + + return 0; +} diff --git a/exercises/eigen/implicit.cpp b/exercises/eigen/implicit.cpp new file mode 100644 index 0000000..2ae3a1d --- /dev/null +++ b/exercises/eigen/implicit.cpp @@ -0,0 +1,42 @@ +#include + +#include +#include + +int main() +{ + int n = 12; + + Eigen::MatrixXd A(n, n); + A = Eigen::MatrixXd::Identity(n, n); + + double delta = -0.4; + + for (int i = 0; i < n - 1; ++i) + { + A(i + 1, i) += delta; + A(i + 1, i + 1) += -delta; + + A(i, i) += -delta; + A(i, i + 1) += +delta; + } + + std::cout << A << std::endl << std::endl; + + Eigen::VectorXd b(n); + b.setZero(); + + b.head(n / 2).array() = 1.0; + + std::ofstream f; + f.open("a.txt"); + for (int i = 0; i < 20; ++i) + { + std::cout << b.transpose() << std::endl; + f << b << std::endl << std::endl << std::endl; + b = A.colPivHouseholderQr().solve(b); + } + + std::cout << b.transpose() << std::endl; + return 0; +} diff --git a/exercises/eigen/modules.sh b/exercises/eigen/modules.sh new file mode 100644 index 0000000..ec998db --- /dev/null +++ b/exercises/eigen/modules.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# To avoid you having to download and install Eigen for yourself, I +# have installed it in a custom module. This `module use` command +# needs to be run once per shell. +module use /work/z04/shared/rwn/modules +module load eigen/3.4.0 + +module load gcc/10.2.0 +module load gnuplot/5.4.0 + diff --git a/exercises/eigen/sparse.cpp b/exercises/eigen/sparse.cpp new file mode 100644 index 0000000..66cded5 --- /dev/null +++ b/exercises/eigen/sparse.cpp @@ -0,0 +1,42 @@ +#include +#include + +#include + +int main() +{ + int n = 10; + + Eigen::SparseMatrix A; + A.resize(n, n); + + double delta = 0.4; + + std::vector> fill; + fill.reserve(n * 4); + + for (int i = 0; i < n - 1; ++i) + { + fill.push_back(Eigen::Triplet(i + 1, i, delta)); + fill.push_back(Eigen::Triplet(i + 1, i + 1, -delta)); + fill.push_back(Eigen::Triplet(i, i, 1.0 - delta)); + fill.push_back(Eigen::Triplet(i, i + 1, delta)); + } + fill.push_back(Eigen::Triplet(n - 1, n - 1, 1.0)); + A.setFromTriplets(fill.begin(), fill.end()); + + std::cout << A << std::endl; + + Eigen::VectorXd b(n); + b.head(n / 2).array() = 1.0; + b.tail(n / 2).array() = 0.0; + + std::cout << b.transpose() << std::endl; + + for (int i = 0; i < 100; ++i) + b = A * b; + + std::cout << b.transpose() << std::endl; + + return 0; +} diff --git a/exercises/include/catch.hpp b/exercises/include/catch.hpp new file mode 100644 index 0000000..e69de29 diff --git a/exercises/kokkos/1/01_Exercise.mk b/exercises/kokkos/1/01_Exercise.mk new file mode 100644 index 0000000..b6d680c --- /dev/null +++ b/exercises/kokkos/1/01_Exercise.mk @@ -0,0 +1,30 @@ +EXE_NAME = "01_Exercise" + +default : $(EXE) + +SRC = $(wildcard *.cpp) + +CXXFLAGS = -O3 +LINK = ${CXX} +LINKFLAGS = + +DEPFLAGS = -M + +OBJ = $(SRC:.cpp=.$(KOKKOS_DEVICES).o) +LIB = +include $(KOKKOS_PATH)/Makefile.kokkos + + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: + rm -f *.o *.cuda *.host + +# Compilation rules + +%.$(KOKKOS_DEVICES).o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $@ + +test: $(EXE) + ./$(EXE) diff --git a/exercises/kokkos/1/Makefile b/exercises/kokkos/1/Makefile new file mode 100644 index 0000000..c04643e --- /dev/null +++ b/exercises/kokkos/1/Makefile @@ -0,0 +1,10 @@ + +all: OpenMP + +OpenMP: + $(MAKE) -f $@.mk + +clean: + rm -f *.o *.OpenMP + + diff --git a/exercises/kokkos/1/OpenMP.mk b/exercises/kokkos/1/OpenMP.mk new file mode 100644 index 0000000..5da3e39 --- /dev/null +++ b/exercises/kokkos/1/OpenMP.mk @@ -0,0 +1,3 @@ +KOKKOS_PATH = $(KOKKOS_DIR)/OpenMP +EXE = ${EXE_NAME}.OpenMP +include 01_Exercise.mk diff --git a/exercises/kokkos/1/exercise_1_begin.cpp b/exercises/kokkos/1/exercise_1_begin.cpp new file mode 100644 index 0000000..2d397c2 --- /dev/null +++ b/exercises/kokkos/1/exercise_1_begin.cpp @@ -0,0 +1,219 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// EXERCISE 1 Goal: +// Use Kokkos to parallelize the outer loop of using Kokkos::parallel_reduce. + +#include +#include +#include +#include +#include + +#include + +void checkSizes( int &N, int &M, int &S, int &nrepeat ); + +int main( int argc, char* argv[] ) +{ + int N = -1; // number of rows 2^12 + int M = -1; // number of columns 2^10 + int S = -1; // total size 2^22 + int nrepeat = 100; // number of repeats of the test + + // Read command line arguments. + for ( int i = 0; i < argc; i++ ) { + if ( ( strcmp( argv[ i ], "-N" ) == 0 ) || ( strcmp( argv[ i ], "-Rows" ) == 0 ) ) { + N = pow( 2, atoi( argv[ ++i ] ) ); + printf( " User N is %d\n", N ); + } + else if ( ( strcmp( argv[ i ], "-M" ) == 0 ) || ( strcmp( argv[ i ], "-Columns" ) == 0 ) ) { + M = pow( 2, atof( argv[ ++i ] ) ); + printf( " User M is %d\n", M ); + } + else if ( ( strcmp( argv[ i ], "-S" ) == 0 ) || ( strcmp( argv[ i ], "-Size" ) == 0 ) ) { + S = pow( 2, atof( argv[ ++i ] ) ); + printf( " User S is %d\n", S ); + } + else if ( strcmp( argv[ i ], "-nrepeat" ) == 0 ) { + nrepeat = atoi( argv[ ++i ] ); + } + else if ( ( strcmp( argv[ i ], "-h" ) == 0 ) || ( strcmp( argv[ i ], "-help" ) == 0 ) ) { + printf( " y^T*A*x Options:\n" ); + printf( " -Rows (-N) : exponent num, determines number of rows 2^num (default: 2^12 = 4096)\n" ); + printf( " -Columns (-M) : exponent num, determines number of columns 2^num (default: 2^10 = 1024)\n" ); + printf( " -Size (-S) : exponent num, determines total matrix size 2^num (default: 2^22 = 4096*1024 )\n" ); + printf( " -nrepeat : number of repetitions (default: 100)\n" ); + printf( " -help (-h): print this message\n\n" ); + exit( 1 ); + } + } + + // Check sizes. + checkSizes( N, M, S, nrepeat ); + + // EXERCISE: Initialize Kokkos runtime. + + // Allocate y, x vectors and Matrix A: + double * const y = new double[ N ]; + double * const x = new double[ M ]; + double * const A = new double[ N * M ]; + + // Initialize y vector. + // EXERCISE: Convert outer loop to Kokkos::parallel_for. + for ( int i = 0; i < N; ++i ) { + y[ i ] = 1; + } + + // Initialize x vector. + // EXERCISE: Convert outer loop to Kokkos::parallel_for. + for ( int i = 0; i < M; ++i ) { + x[ i ] = 1; + } + + // Initialize A matrix, note 2D indexing computation. + // EXERCISE: Convert outer loop to Kokkos::parallel_for. + for ( int j = 0; j < N; ++j ) { + for ( int i = 0; i < M; ++i ) { + A[ j * M + i ] = 1; + } + } + + // Timer products. + struct timeval begin, end; + + gettimeofday( &begin, NULL ); + + for ( int repeat = 0; repeat < nrepeat; repeat++ ) { + // Application: = y^T*A*x + double result = 0; + + // EXERCISE: Convert outer loop to Kokkos::parallel_reduce. + for ( int j = 0; j < N; ++j ) { + double temp2 = 0; + + for ( int i = 0; i < M; ++i ) { + temp2 += A[ j * M + i ] * x[ i ]; + } + + result += y[ j ] * temp2; + } + + // Output result. + if ( repeat == ( nrepeat - 1 ) ) { + printf( " Computed result for %d x %d is %lf\n", N, M, result ); + } + + const double solution = (double) N * (double) M; + + if ( result != solution ) { + printf( " Error: result( %lf ) != solution( %lf )\n", result, solution ); + } + } + + gettimeofday( &end, NULL ); + + // Calculate time. + double time = 1.0 * ( end.tv_sec - begin.tv_sec ) + + 1.0e-6 * ( end.tv_usec - begin.tv_usec ); + + // Calculate bandwidth. + // Each matrix A row (each of length M) is read once. + // The x vector (of length M) is read N times. + // The y vector (of length N) is read once. + // double Gbytes = 1.0e-9 * double( sizeof(double) * ( 2 * M * N + N ) ); + double Gbytes = 1.0e-9 * double( sizeof(double) * ( M + M * N + N ) ); + + // Print results (problem size, time and bandwidth in GB/s). + printf( " N( %d ) M( %d ) nrepeat ( %d ) problem( %g MB ) time( %g s ) bandwidth( %g GB/s )\n", + N, M, nrepeat, Gbytes * 1000, time, Gbytes * nrepeat / time ); + + delete[] A; + delete[] y; + delete[] x; + + // EXERCISE: finalize Kokkos runtime + + return 0; +} + +void checkSizes( int &N, int &M, int &S, int &nrepeat ) { + // If S is undefined and N or M is undefined, set S to 2^22 or the bigger of N and M. + if ( S == -1 && ( N == -1 || M == -1 ) ) { + S = pow( 2, 22 ); + if ( S < N ) S = N; + if ( S < M ) S = M; + } + + // If S is undefined and both N and M are defined, set S = N * M. + if ( S == -1 ) S = N * M; + + // If both N and M are undefined, fix row length to the smaller of S and 2^10 = 1024. + if ( N == -1 && M == -1 ) { + if ( S > 1024 ) { + M = 1024; + } + else { + M = S; + } + } + + // If only M is undefined, set it. + if ( M == -1 ) M = S / N; + + // If N is undefined, set it. + if ( N == -1 ) N = S / M; + + printf( " Total size S = %d N = %d M = %d\n", S, N, M ); + + // Check sizes. + if ( ( S < 0 ) || ( N < 0 ) || ( M < 0 ) || ( nrepeat < 0 ) ) { + printf( " Sizes must be greater than 0.\n" ); + exit( 1 ); + } + + if ( ( N * M ) != S ) { + printf( " N * M != S\n" ); + exit( 1 ); + } +} diff --git a/exercises/kokkos/1/submit.sh b/exercises/kokkos/1/submit.sh new file mode 100644 index 0000000..74c6302 --- /dev/null +++ b/exercises/kokkos/1/submit.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# +#PBS -N kokkos1 +#PBS -q gpu-teach +#PBS -l select=1:ncpus=10:ngpus=1 +#PBS -l walltime=0:01:00 + +# Budget: use either your default or the reservation +#PBS -A $budget + +# Load the required modules +module load gcc cuda kokkos + +cd $PBS_O_WORKDIR + +export OMP_PROC_BIND=spread +export OMP_PLACES=threads + +for threads in 1 2 4 8; do + export OMP_NUM_THREADS=$threads + echo "OMP_NUM_THREADS=$OMP_NUM_THREADS" + for NROWS in 4 6 8 10 12 14 16; do + ./01_Exercise.OpenMP -N $NROWS + done +done + + diff --git a/exercises/kokkos/2/02_Exercise.mk b/exercises/kokkos/2/02_Exercise.mk new file mode 100644 index 0000000..2b8bc6f --- /dev/null +++ b/exercises/kokkos/2/02_Exercise.mk @@ -0,0 +1,26 @@ +EXE_NAME = "02_Exercise" + +default : $(EXE) + +SRC = $(wildcard *.cpp) + +CXXFLAGS = -O3 +LINK = $(CXX) + +DEPFLAGS = -M + +OBJ = $(SRC:.cpp=.$(KOKKOS_DEVICES).o) + +include $(KOKKOS_PATH)/Makefile.kokkos + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) -o $(EXE) + +clean: + rm -f *.o *.cuda *.host + +# Compilation rules + +%.$(KOKKOS_DEVICES).o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $@ + diff --git a/exercises/kokkos/2/CudaUVM.mk b/exercises/kokkos/2/CudaUVM.mk new file mode 100644 index 0000000..4b454cc --- /dev/null +++ b/exercises/kokkos/2/CudaUVM.mk @@ -0,0 +1,4 @@ +KOKKOS_PATH = $(KOKKOS_DIR)/CudaUVM +EXE = ${EXE_NAME}.CudaUVM +include 02_Exercise.mk + diff --git a/exercises/kokkos/2/Makefile b/exercises/kokkos/2/Makefile new file mode 100644 index 0000000..5f267e2 --- /dev/null +++ b/exercises/kokkos/2/Makefile @@ -0,0 +1,13 @@ + +all: OpenMP CudaUVM + +OpenMP: + $(MAKE) -f $@.mk + +CudaUVM: + $(MAKE) -f $@.mk + +clean: + rm -f *.o *.CudaUVM *.OpenMP + + diff --git a/exercises/kokkos/2/OpenMP.mk b/exercises/kokkos/2/OpenMP.mk new file mode 100644 index 0000000..13d5024 --- /dev/null +++ b/exercises/kokkos/2/OpenMP.mk @@ -0,0 +1,3 @@ +KOKKOS_PATH = $(KOKKOS_DIR)/OpenMP +EXE = ${EXE_NAME}.OpenMP +include 02_Exercise.mk diff --git a/exercises/kokkos/2/exercise_2_begin.cpp b/exercises/kokkos/2/exercise_2_begin.cpp new file mode 100644 index 0000000..8710f28 --- /dev/null +++ b/exercises/kokkos/2/exercise_2_begin.cpp @@ -0,0 +1,235 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// EXERCISE 2 Goal: +// Replace raw allocations with Kokkos Views. +// 1. Define device views. +// 2. Replace data access with view access operators. +// +// Note: Kokkos::parallel_for() initializations were removed to initialize on host. + +#include +#include +#include +#include +#include + +#include + +void checkSizes( int &N, int &M, int &S, int &nrepeat ); + +int main( int argc, char* argv[] ) +{ + int N = -1; // number of rows 2^12 + int M = -1; // number of columns 2^10 + int S = -1; // total size 2^22 + int nrepeat = 100; // number of repeats of the test + + // Read command line arguments. + for ( int i = 0; i < argc; i++ ) { + if ( ( strcmp( argv[ i ], "-N" ) == 0 ) || ( strcmp( argv[ i ], "-Rows" ) == 0 ) ) { + N = pow( 2, atoi( argv[ ++i ] ) ); + printf( " User N is %d\n", N ); + } + else if ( ( strcmp( argv[ i ], "-M" ) == 0 ) || ( strcmp( argv[ i ], "-Columns" ) == 0 ) ) { + M = pow( 2, atof( argv[ ++i ] ) ); + printf( " User M is %d\n", M ); + } + else if ( ( strcmp( argv[ i ], "-S" ) == 0 ) || ( strcmp( argv[ i ], "-Size" ) == 0 ) ) { + S = pow( 2, atof( argv[ ++i ] ) ); + printf( " User S is %d\n", S ); + } + else if ( strcmp( argv[ i ], "-nrepeat" ) == 0 ) { + nrepeat = atoi( argv[ ++i ] ); + } + else if ( ( strcmp( argv[ i ], "-h" ) == 0 ) || ( strcmp( argv[ i ], "-help" ) == 0 ) ) { + printf( " y^T*A*x Options:\n" ); + printf( " -Rows (-N) : exponent num, determines number of rows 2^num (default: 2^12 = 4096)\n" ); + printf( " -Columns (-M) : exponent num, determines number of columns 2^num (default: 2^10 = 1024)\n" ); + printf( " -Size (-S) : exponent num, determines total matrix size 2^num (default: 2^22 = 4096*1024 )\n" ); + printf( " -nrepeat : number of repetitions (default: 100)\n" ); + printf( " -help (-h): print this message\n\n" ); + exit( 1 ); + } + } + + // Check sizes. + checkSizes( N, M, S, nrepeat ); + + Kokkos::initialize( argc, argv ); + { + // Allocate y, x vectors and Matrix A + // + // EXERCISE: Create views of the right size. + // 1. Device Views + // Hint: If arrays are not allocated, they also do not need to be + // deallocated below + using ViewVectorType = double*; + using ViewMatrixType = double**; + double* const y = new double[ N ]; + double* const x = new double[ M ];; + double* const A = new double[ N * M ];; + + // Initialize y vector on host. + // EXERCISE: Convert y to 1D View's member access API: x(i) + for ( int i = 0; i < N; ++i ) { + y[ i ] = 1; + } + + // Initialize x vector on host. + // EXERCISE: Convert x to 1D View's member access API: x(i) + for ( int i = 0; i < M; ++i ) { + x[ i ] = 1; + } + + // Initialize A matrix on host, note 2D indexing computation. + // EXERCISE: convert 'A' to use View's member access API: A(j,i) + for ( int j = 0; j < N; ++j ) { + for ( int i = 0; i < M; ++i ) { + A[ j * M + i ] = 1; + } + } + + // Timer products. + struct timeval begin, end; + + gettimeofday( &begin, NULL ); + + for ( int repeat = 0; repeat < nrepeat; repeat++ ) { + // Application: = y^T*A*x + double result = 0; + + Kokkos::parallel_reduce( N, KOKKOS_LAMBDA ( int j, double &update ) { + double temp2 = 0; + + // EXERCISE: Replace access with view access operators. + for ( int i = 0; i < M; ++i ) { + temp2 += A[ j * M + i ] * x[ i ]; + } + + update += y[ j ] * temp2; + }, result ); + + // Ensure that all kernels are finished + Kokkos::fence(); + + // Output result. + if ( repeat == ( nrepeat - 1 ) ) { + printf( " Computed result for %d x %d is %lf\n", N, M, result ); + } + + const double solution = (double) N * (double) M; + + if ( result != solution ) { + printf( " Error: result( %lf ) != solution( %lf )\n", result, solution ); + } + } + + gettimeofday( &end, NULL ); + + // Calculate time. + double time = 1.0 * ( end.tv_sec - begin.tv_sec ) + + 1.0e-6 * ( end.tv_usec - begin.tv_usec ); + + // Calculate bandwidth. + // Each matrix A row (each of length M) is read once. + // The x vector (of length M) is read N times. + // The y vector (of length N) is read once. + // double Gbytes = 1.0e-9 * double( sizeof(double) * ( 2 * M * N + N ) ); + double Gbytes = 1.0e-9 * double( sizeof(double) * ( M + M * N + N ) ); + + // Print results (problem size, time and bandwidth in GB/s). + printf( " N( %d ) M( %d ) nrepeat ( %d ) problem( %g MB ) time( %g s ) bandwidth( %g GB/s )\n", + N, M, nrepeat, Gbytes * 1000, time, Gbytes * nrepeat / time ); + + //EXERCISE hint... + delete [] y; + delete [] x; + delete [] A; + } + + Kokkos::finalize(); + + return 0; +} + +void checkSizes( int &N, int &M, int &S, int &nrepeat ) { + // If S is undefined and N or M is undefined, set S to 2^22 or the bigger of N and M. + if ( S == -1 && ( N == -1 || M == -1 ) ) { + S = pow( 2, 22 ); + if ( S < N ) S = N; + if ( S < M ) S = M; + } + + // If S is undefined and both N and M are defined, set S = N * M. + if ( S == -1 ) S = N * M; + + // If both N and M are undefined, fix row length to the smaller of S and 2^10 = 1024. + if ( N == -1 && M == -1 ) { + if ( S > 1024 ) { + M = 1024; + } + else { + M = S; + } + } + + // If only M is undefined, set it. + if ( M == -1 ) M = S / N; + + // If N is undefined, set it. + if ( N == -1 ) N = S / M; + + printf( " Total size S = %d N = %d M = %d\n", S, N, M ); + + // Check sizes. + if ( ( S < 0 ) || ( N < 0 ) || ( M < 0 ) || ( nrepeat < 0 ) ) { + printf( " Sizes must be greater than 0.\n" ); + exit( 1 ); + } + + if ( ( N * M ) != S ) { + printf( " N * M != S\n" ); + exit( 1 ); + } +} diff --git a/exercises/kokkos/2/submit.sh b/exercises/kokkos/2/submit.sh new file mode 100644 index 0000000..aa94596 --- /dev/null +++ b/exercises/kokkos/2/submit.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# +#PBS -N kokkos2 +#PBS -q gpu-teach +#PBS -l select=1:ncpus=10:ngpus=1 +#PBS -l walltime=0:01:00 + +# Budget: use either your default or the reservation +#PBS -A $budget + +# Load the required modules +module load gcc cuda kokkos + +cd $PBS_O_WORKDIR + +export OMP_PROC_BIND=spread +export OMP_PLACES=threads +export OMP_NUM_THREADS=10 + +# Pick a random device as PBS on Cirrus not yet configured to control +# GPU visibility +r=$RANDOM; let "r %= 4"; +export CUDA_VISIBLE_DEVICES=$r + + +for NROWS in 4 6 8 10 12 14 16; do + echo "OpenMP version, OMP_NUM_THREADS=$OMP_NUM_THREADS" + ./02_Exercise.OpenMP -N $NROWS + echo "CudaUVM version" + ./02_Exercise.CudaUVM -N $NROWS +done diff --git a/exercises/kokkos/3/03_Exercise.mk b/exercises/kokkos/3/03_Exercise.mk new file mode 100644 index 0000000..c84db51 --- /dev/null +++ b/exercises/kokkos/3/03_Exercise.mk @@ -0,0 +1,30 @@ +EXE_NAME = "03_Exercise" + +default : $(EXE) + +SRC = $(wildcard *.cpp) + +CXXFLAGS = -O3 +LINK = ${CXX} +LINKFLAGS = + +DEPFLAGS = -M + +OBJ = $(SRC:.cpp=.$(KOKKOS_DEVICES).o) +LIB = +include $(KOKKOS_PATH)/Makefile.kokkos + + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: + rm -f *.o *.cuda *.host + +# Compilation rules + +%.$(KOKKOS_DEVICES).o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $@ + +test: $(EXE) + ./$(EXE) diff --git a/exercises/kokkos/3/Cuda.mk b/exercises/kokkos/3/Cuda.mk new file mode 100644 index 0000000..37ef0d4 --- /dev/null +++ b/exercises/kokkos/3/Cuda.mk @@ -0,0 +1,4 @@ +KOKKOS_PATH = $(KOKKOS_DIR)/Cuda +EXE = ${EXE_NAME}.Cuda +include 03_Exercise.mk + diff --git a/exercises/kokkos/3/Makefile b/exercises/kokkos/3/Makefile new file mode 100644 index 0000000..3a84d4d --- /dev/null +++ b/exercises/kokkos/3/Makefile @@ -0,0 +1,13 @@ + +all: OpenMP Cuda + +OpenMP: + $(MAKE) -f $@.mk + +Cuda: + $(MAKE) -f $@.mk + +clean: + rm -f *.o *.Cuda *.OpenMP + + diff --git a/exercises/kokkos/3/OpenMP.mk b/exercises/kokkos/3/OpenMP.mk new file mode 100644 index 0000000..981f709 --- /dev/null +++ b/exercises/kokkos/3/OpenMP.mk @@ -0,0 +1,3 @@ +KOKKOS_PATH = $(KOKKOS_DIR)/OpenMP +EXE = ${EXE_NAME}.OpenMP +include 03_Exercise.mk diff --git a/exercises/kokkos/3/exercise_3_begin.cpp b/exercises/kokkos/3/exercise_3_begin.cpp new file mode 100644 index 0000000..3b8349c --- /dev/null +++ b/exercises/kokkos/3/exercise_3_begin.cpp @@ -0,0 +1,226 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// EXERCISE 3 Goal: +// Manage data transfers from Host to Device manually. +// 1. Define device views. +// 2. Define host views using mirror of corresponding device views. +// 3. Initialize the host views. +// 4. Deep copy host view to device view. + +#include +#include +#include +#include +#include + +#include + +void checkSizes( int &N, int &M, int &S, int &nrepeat ); + +int main( int argc, char* argv[] ) +{ + int N = -1; // number of rows 2^12 + int M = -1; // number of columns 2^10 + int S = -1; // total size 2^22 + int nrepeat = 100; // number of repeats of the test + + // Read command line arguments. + for ( int i = 0; i < argc; i++ ) { + if ( ( strcmp( argv[ i ], "-N" ) == 0 ) || ( strcmp( argv[ i ], "-Rows" ) == 0 ) ) { + N = pow( 2, atoi( argv[ ++i ] ) ); + printf( " User N is %d\n", N ); + } + else if ( ( strcmp( argv[ i ], "-M" ) == 0 ) || ( strcmp( argv[ i ], "-Columns" ) == 0 ) ) { + M = pow( 2, atof( argv[ ++i ] ) ); + printf( " User M is %d\n", M ); + } + else if ( ( strcmp( argv[ i ], "-S" ) == 0 ) || ( strcmp( argv[ i ], "-Size" ) == 0 ) ) { + S = pow( 2, atof( argv[ ++i ] ) ); + printf( " User S is %d\n", S ); + } + else if ( strcmp( argv[ i ], "-nrepeat" ) == 0 ) { + nrepeat = atoi( argv[ ++i ] ); + } + else if ( ( strcmp( argv[ i ], "-h" ) == 0 ) || ( strcmp( argv[ i ], "-help" ) == 0 ) ) { + printf( " y^T*A*x Options:\n" ); + printf( " -Rows (-N) : exponent num, determines number of rows 2^num (default: 2^12 = 4096)\n" ); + printf( " -Columns (-M) : exponent num, determines number of columns 2^num (default: 2^10 = 1024)\n" ); + printf( " -Size (-S) : exponent num, determines total matrix size 2^num (default: 2^22 = 4096*1024 )\n" ); + printf( " -nrepeat : number of repetitions (default: 100)\n" ); + printf( " -help (-h): print this message\n\n" ); + exit( 1 ); + } + } + + // Check sizes. + checkSizes( N, M, S, nrepeat ); + + Kokkos::initialize( argc, argv ); + { + // Allocate y, x vectors and Matrix A on device. + typedef Kokkos::View ViewVectorType; + typedef Kokkos::View ViewMatrixType; + ViewVectorType y( "y", N ); + ViewVectorType x( "x", M ); + ViewMatrixType A( "A", N, M ); + + // EXERCISE: Create mirrors of the views on host. + + // Initialize y vector on host. + // EXERCISE: Use host version of y. + for ( int i = 0; i < N; ++i ) { + y( i ) = 1; + } + + // Initialize x vector on host. + // EXERCISE: Use host version of x. + for ( int i = 0; i < M; ++i ) { + x( i ) = 1; + } + + // Initialize A matrix on host. + // EXERCISE: Use host version of A. + for ( int j = 0; j < N; ++j ) { + for ( int i = 0; i < M; ++i ) { + A( j, i ) = 1; + } + } + + // EXERCISE: Perform deep copy of host views to device views. + + // Timer products. + struct timeval begin, end; + + gettimeofday( &begin, NULL ); + + for ( int repeat = 0; repeat < nrepeat; repeat++ ) { + // Application: = y^T*A*x + double result = 0; + + Kokkos::parallel_reduce( N, KOKKOS_LAMBDA ( int j, double &update ) { + double temp2 = 0; + + for ( int i = 0; i < M; ++i ) { + temp2 += A( j, i ) * x( i ); + } + + update += y( j ) * temp2; + }, result ); + + // Output result. + if ( repeat == ( nrepeat - 1 ) ) { + printf( " Computed result for %d x %d is %lf\n", N, M, result ); + } + + const double solution = (double) N * (double) M; + + if ( result != solution ) { + printf( " Error: result( %lf ) != solution( %lf )\n", result, solution ); + } + } + + gettimeofday( &end, NULL ); + + // Calculate time. + double time = 1.0 * ( end.tv_sec - begin.tv_sec ) + + 1.0e-6 * ( end.tv_usec - begin.tv_usec ); + + // Calculate bandwidth. + // Each matrix A row (each of length M) is read once. + // The x vector (of length M) is read N times. + // The y vector (of length N) is read once. + // double Gbytes = 1.0e-9 * double( sizeof(double) * ( 2 * M * N + N ) ); + double Gbytes = 1.0e-9 * double( sizeof(double) * ( M + M * N + N ) ); + + // Print results (problem size, time and bandwidth in GB/s). + printf( " N( %d ) M( %d ) nrepeat ( %d ) problem( %g MB ) time( %g s ) bandwidth( %g GB/s )\n", + N, M, nrepeat, Gbytes * 1000, time, Gbytes * nrepeat / time ); + + } + + Kokkos::finalize(); + + return 0; +} + +void checkSizes( int &N, int &M, int &S, int &nrepeat ) { + // If S is undefined and N or M is undefined, set S to 2^22 or the bigger of N and M. + if ( S == -1 && ( N == -1 || M == -1 ) ) { + S = pow( 2, 22 ); + if ( S < N ) S = N; + if ( S < M ) S = M; + } + + // If S is undefined and both N and M are defined, set S = N * M. + if ( S == -1 ) S = N * M; + + // If both N and M are undefined, fix row length to the smaller of S and 2^10 = 1024. + if ( N == -1 && M == -1 ) { + if ( S > 1024 ) { + M = 1024; + } + else { + M = S; + } + } + + // If only M is undefined, set it. + if ( M == -1 ) M = S / N; + + // If N is undefined, set it. + if ( N == -1 ) N = S / M; + + printf( " Total size S = %d N = %d M = %d\n", S, N, M ); + + // Check sizes. + if ( ( S < 0 ) || ( N < 0 ) || ( M < 0 ) || ( nrepeat < 0 ) ) { + printf( " Sizes must be greater than 0.\n" ); + exit( 1 ); + } + + if ( ( N * M ) != S ) { + printf( " N * M != S\n" ); + exit( 1 ); + } +} diff --git a/exercises/kokkos/3/submit.sh b/exercises/kokkos/3/submit.sh new file mode 100644 index 0000000..f6e85a4 --- /dev/null +++ b/exercises/kokkos/3/submit.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# +#PBS -N kokkos3 +#PBS -q gpu-teach +#PBS -l select=1:ncpus=10:ngpus=1 +#PBS -l walltime=0:01:00 + +# Budget: use either your default or the reservation +#PBS -A $budget + +# Load the required modules +module load gcc cuda kokkos + +cd $PBS_O_WORKDIR + +export OMP_PROC_BIND=spread +export OMP_PLACES=threads +export OMP_NUM_THREADS=10 + +# Pick a random device as PBS on Cirrus not yet configured to control +# GPU visibility +r=$RANDOM; let "r %= 4"; +export CUDA_VISIBLE_DEVICES=$r + + +for NROWS in 4 6 8 10 12 14 16; do + echo "OpenMP version, OMP_NUM_THREADS=$OMP_NUM_THREADS" + ./03_Exercise.OpenMP -N $NROWS + echo "CudaUVM version" + ./03_Exercise.Cuda -N $NROWS +done diff --git a/exercises/kokkos/4/04_Exercise.mk b/exercises/kokkos/4/04_Exercise.mk new file mode 100644 index 0000000..571fcac --- /dev/null +++ b/exercises/kokkos/4/04_Exercise.mk @@ -0,0 +1,30 @@ +EXE_NAME = "04_Exercise" + +default : $(EXE) + +SRC = $(wildcard *.cpp) + +CXXFLAGS = -O3 +LINK = ${CXX} +LINKFLAGS = + +DEPFLAGS = -M + +OBJ = $(SRC:.cpp=.Any.o) +LIB = +include $(KOKKOS_PATH)/Makefile.kokkos + + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: + rm -f *.o *.cuda *.host + +# Compilation rules + +%.Any.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $@ + +test: $(EXE) + ./$(EXE) diff --git a/exercises/kokkos/4/Any.mk b/exercises/kokkos/4/Any.mk new file mode 100644 index 0000000..33db1e6 --- /dev/null +++ b/exercises/kokkos/4/Any.mk @@ -0,0 +1,3 @@ +KOKKOS_PATH = $(KOKKOS_DIR)/All +EXE = ${EXE_NAME}.Any +include 04_Exercise.mk diff --git a/exercises/kokkos/4/Makefile b/exercises/kokkos/4/Makefile new file mode 100644 index 0000000..a6894b4 --- /dev/null +++ b/exercises/kokkos/4/Makefile @@ -0,0 +1,5 @@ +Any: + $(MAKE) -f $@.mk + +clean: + rm -f *.o *.Any diff --git a/exercises/kokkos/4/exercise_4_begin.cpp b/exercises/kokkos/4/exercise_4_begin.cpp new file mode 100644 index 0000000..b3de495 --- /dev/null +++ b/exercises/kokkos/4/exercise_4_begin.cpp @@ -0,0 +1,254 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// EXERCISE 4 Goals: +// Add custom Spaces, Layouts to Views. +// Add custom RangePolicy to Kokkos::parallel_*** calls. +// Experiment by changing the Layouts and Spaces; then test the effect on performance. + +#include +#include +#include +#include +#include + +#include + +void checkSizes( int &N, int &M, int &S, int &nrepeat ); + +int main( int argc, char* argv[] ) +{ + int N = -1; // number of rows 2^12 + int M = -1; // number of columns 2^10 + int S = -1; // total size 2^22 + int nrepeat = 100; // number of repeats of the test + + // Read command line arguments. + for ( int i = 0; i < argc; i++ ) { + if ( ( strcmp( argv[ i ], "-N" ) == 0 ) || ( strcmp( argv[ i ], "-Rows" ) == 0 ) ) { + N = pow( 2, atoi( argv[ ++i ] ) ); + printf( " User N is %d\n", N ); + } + else if ( ( strcmp( argv[ i ], "-M" ) == 0 ) || ( strcmp( argv[ i ], "-Columns" ) == 0 ) ) { + M = pow( 2, atof( argv[ ++i ] ) ); + printf( " User M is %d\n", M ); + } + else if ( ( strcmp( argv[ i ], "-S" ) == 0 ) || ( strcmp( argv[ i ], "-Size" ) == 0 ) ) { + S = pow( 2, atof( argv[ ++i ] ) ); + printf( " User S is %d\n", S ); + } + else if ( strcmp( argv[ i ], "-nrepeat" ) == 0 ) { + nrepeat = atoi( argv[ ++i ] ); + } + else if ( ( strcmp( argv[ i ], "-h" ) == 0 ) || ( strcmp( argv[ i ], "-help" ) == 0 ) ) { + printf( " y^T*A*x Options:\n" ); + printf( " -Rows (-N) : exponent num, determines number of rows 2^num (default: 2^12 = 4096)\n" ); + printf( " -Columns (-M) : exponent num, determines number of columns 2^num (default: 2^10 = 1024)\n" ); + printf( " -Size (-S) : exponent num, determines total matrix size 2^num (default: 2^22 = 4096*1024 )\n" ); + printf( " -nrepeat : number of repetitions (default: 100)\n" ); + printf( " -help (-h): print this message\n\n" ); + exit( 1 ); + } + } + + // Check sizes. + checkSizes( N, M, S, nrepeat ); + + Kokkos::initialize( argc, argv ); + + // EXERCISE give-away: Choose an Execution Space. + // using ExecSpace = Kokkos::Serial; + // using ExecSpace = Kokkos::Threads; + // using ExecSpace = Kokkos::OpenMP; + using ExecSpace = Kokkos::Cuda; + + // EXERCISE: Choose device memory space. + // using MemSpace = Kokkos::HostSpace; + // using MemSpace = Kokkos::OpenMP; + // using MemSpace = Kokkos::CudaSpace; + using MemSpace = Kokkos::CudaUVMSpace; + + // EXERCISE give-away: Choose a Layout. + // EXERCISE: When exercise is correctly implemented, then + // either layout will generate the correct answer. + // However, performance will be different! + // using Layout = Kokkos::LayoutLeft; + using Layout = Kokkos::LayoutRight; + + // EXERCISE give-away: Use a RangePolicy. + using range_policy = Kokkos::RangePolicy; + + // Allocate y, x vectors and Matrix A on device. + // EXERCISE: Use MemSpace and Layout. + using ViewVectorType = Kokkos::View; + using ViewMatrixType = Kokkos::View; + { + ViewVectorType y( "y", N ); + ViewVectorType x( "x", M ); + ViewMatrixType A( "A", N, M ); + + // Create host mirrors of device views. + ViewVectorType::HostMirror h_y = Kokkos::create_mirror_view( y ); + ViewVectorType::HostMirror h_x = Kokkos::create_mirror_view( x ); + ViewMatrixType::HostMirror h_A = Kokkos::create_mirror_view( A ); + + // Initialize y vector on host. + for ( int i = 0; i < N; ++i ) { + h_y( i ) = 1; + } + + // Initialize x vector on host. + for ( int i = 0; i < M; ++i ) { + h_x( i ) = 1; + } + + // Initialize A matrix on host. + for ( int j = 0; j < N; ++j ) { + for ( int i = 0; i < M; ++i ) { + h_A( j, i ) = 1; + } + } + + // Deep copy host views to device views. + Kokkos::deep_copy( y, h_y ); + Kokkos::deep_copy( x, h_x ); + Kokkos::deep_copy( A, h_A ); + + // Timer products. + struct timeval begin, end; + + gettimeofday( &begin, NULL ); + + for ( int repeat = 0; repeat < nrepeat; repeat++ ) { + // Application: = y^T*A*x + double result = 0; + + // EXERCISE: Use Kokkos::RangePolicy to execute parallel_reduce + // in the correct space. + Kokkos::parallel_reduce(N, + KOKKOS_LAMBDA ( int j, double &update ) { + double temp2 = 0; + + for ( int i = 0; i < M; ++i ) { + temp2 += A( j, i ) * x( i ); + } + + update += y( j ) * temp2; + }, result ); + Kokkos::fence(); + // Output result. + if ( repeat == ( nrepeat - 1 ) ) { + printf( " Computed result for %d x %d is %lf\n", N, M, result ); + } + + const double solution = (double) N * (double) M; + + if ( result != solution ) { + printf( " Error: result( %lf ) != solution( %lf )\n", result, solution ); + } + } + + gettimeofday( &end, NULL ); + + // Calculate time. + double time = 1.0 * ( end.tv_sec - begin.tv_sec ) + + 1.0e-6 * ( end.tv_usec - begin.tv_usec ); + + // Calculate bandwidth. + // Each matrix A row (each of length M) is read once. + // The x vector (of length M) is read N times. + // The y vector (of length N) is read once. + // double Gbytes = 1.0e-9 * double( sizeof(double) * ( 2 * M * N + N ) ); + double Gbytes = 1.0e-9 * double( sizeof(double) * ( M + M * N + N ) ); + + // Print results (problem size, time and bandwidth in GB/s). + printf( " N( %d ) M( %d ) nrepeat ( %d ) problem( %g MB ) time( %g s ) bandwidth( %g GB/s )\n", + N, M, nrepeat, Gbytes * 1000, time, Gbytes * nrepeat / time ); + + } + + Kokkos::finalize(); + + return 0; +} + +void checkSizes( int &N, int &M, int &S, int &nrepeat ) { + // If S is undefined and N or M is undefined, set S to 2^22 or the bigger of N and M. + if ( S == -1 && ( N == -1 || M == -1 ) ) { + S = pow( 2, 22 ); + if ( S < N ) S = N; + if ( S < M ) S = M; + } + + // If S is undefined and both N and M are defined, set S = N * M. + if ( S == -1 ) S = N * M; + + // If both N and M are undefined, fix row length to the smaller of S and 2^10 = 1024. + if ( N == -1 && M == -1 ) { + if ( S > 1024 ) { + M = 1024; + } + else { + M = S; + } + } + + // If only M is undefined, set it. + if ( M == -1 ) M = S / N; + + // If N is undefined, set it. + if ( N == -1 ) N = S / M; + + printf( " Total size S = %d N = %d M = %d\n", S, N, M ); + + // Check sizes. + if ( ( S < 0 ) || ( N < 0 ) || ( M < 0 ) || ( nrepeat < 0 ) ) { + printf( " Sizes must be greater than 0.\n" ); + exit( 1 ); + } + + if ( ( N * M ) != S ) { + printf( " N * M != S\n" ); + exit( 1 ); + } +} diff --git a/exercises/kokkos/4/submit.sh b/exercises/kokkos/4/submit.sh new file mode 100644 index 0000000..7dc65fa --- /dev/null +++ b/exercises/kokkos/4/submit.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# +#PBS -N kokkos4 +#PBS -q gpu-teach +#PBS -l select=1:ncpus=10:ngpus=1 +#PBS -l walltime=0:01:00 + +# Budget: use either your default or the reservation +#PBS -A $budget + +# Load the required modules +module load gcc cuda kokkos + +cd $PBS_O_WORKDIR + +export OMP_PROC_BIND=spread +export OMP_PLACES=threads +export OMP_NUM_THREADS=10 + +# Pick a random device as PBS on Cirrus not yet configured to control +# GPU visibility +r=$RANDOM; let "r %= 4"; +export CUDA_VISIBLE_DEVICES=$r + + +for NROWS in 4 6 8 10 12 14 16; do + ./04_Exercise.Any -N $NROWS +done + + + diff --git a/exercises/kokkos/README.md b/exercises/kokkos/README.md new file mode 100644 index 0000000..f8a502d --- /dev/null +++ b/exercises/kokkos/README.md @@ -0,0 +1,3 @@ +This document is available in multiple formats: +* [PDF](instructions.pdf) +* [Markdown](instructions.md) diff --git a/exercises/kokkos/dot_prod.png b/exercises/kokkos/dot_prod.png new file mode 100644 index 0000000..0f2c5df Binary files /dev/null and b/exercises/kokkos/dot_prod.png differ diff --git a/exercises/kokkos/instructions.md b/exercises/kokkos/instructions.md new file mode 100644 index 0000000..64d87cb --- /dev/null +++ b/exercises/kokkos/instructions.md @@ -0,0 +1,152 @@ +# Introduction to Kokkos + +This lab will introduce the Kokkos framework for portable performance. + +This mainly follows the training given at SC 2017 and is also +available from +[their GitHub](https://github.com/kokkos/kokkos-tutorials/blob/master/Intro-Short/Slides/KokkosTutorial_SC17.pdf). + +__Note__: you should use these build instructions instead of the ones +in the slides. + +The files for this are on [the course repository](https://github.com/EPCCed/APT-CPP). + +To check out the repository run: + +```bash +git clone https://github.com/EPCCed/APT-CPP +cd APT-CPP/exercies/kokkos +``` + +## Problem: Inner product + +We will be looking at the inner product: + +![Inner product](dot_prod.png) + +where x and y are vectors and A is a matrix. + +## Cirrus +Please ensure you are logged into the node login0. + +Load required modules: +``` +module load gcc cuda kokkos +``` + +## Plots + +We have provided a Jupyter notebook for creating plots of the +performance: [plot.ipynb](plot.ipynb). + +You can run this on your laptop if you have Jupyter installed or on +Cirrus, connecting your local browser via a SSH tunnel. + +To run locally, simply download the notebook to your machine and run +the notebook which should open in your default browser if correctly +configured: + +``` +jupyter-notenook plot.ipynb +``` + +To run the notebook on Cirrus, run the script `run_nb_server.sh` with +two arguments: the port to listen on (must be a unique number between +1024 and 65536) and a password, e.g. +``` +./run_nb_server.sh 6547 s3cret +``` +The script will print further instructions. + + +# 1. Initialise, Use, Finalise + +The corresponding source code can be found in +`APT-CPP/exercies/kokkos/1/exercise_1_begin.cpp` + +In short, you need to set up Kokkos by calling `Kokkos::initialize` +and `Kokkos::finalize` and convert the _outer_ loops to use the +parallel algorithms. The places where you need to change things are +marked with `EXERCISE`. + +On Cirrus, you can simply compile with `make` and the executable will +then use OpenMP to parallelise the loops across available cores. +While you *can* run on the login node for the OpenMP and serial +backends, please submit to the queue. An example job script is +`submit.sh`. Please ensure you set the number of threads to less than +or equal to the number of cores you request in the job script! + +When you run you can set the problem size with command line +flags. These are passed as a power of 2 (e.g. `10 => 2**10 = 1024`). + +* `-N` - the number of rows (default = 12) +* `-M` - the number of columns (default = 10) +* `-S` - the total size (default = 22, must equal sum of M and N) + +Can also specify the number of repeats: +* `-nrepeat` (default = 100) + +When using OpenMP, specify the number of threads to use by setting the +`OMP_NUM_THREADS` environment variable, e.g.: +``` +export OMP_NUM_THREADS=1 +./01_Exercise.OpenMP -N 12 +``` +Or instead use the command line argument `--kokkos-threads=n`. You can +also use the option `--kokkos-numa=2` if you run across the two +sockets on nodes. + +Things to look at: + +* vary problem size +* vary number of cores +* compare Skylake Gold (GPU nodes) vs Broadwell (regular nodes) + +# 2. Use Views + +The corresponding source code is in +`APT-CPP/exercies/kokkos/2/exercise_2_begin.cpp`. + +In this part, you change the data storage from raw arrays to +`Kokkos::View` instantiations. The `View` template class provides +access to elements via `operator()` to allow multidimensional indexing +and thus encapsulation of the actual layout of data. + +Note: we are forcing Kokkos use to unified memory for the CUDA build +so you do not need to use mirror views (yet!). + +Compile with `make` and run the code on the CPU with OpenMP and on the +GPU with CUDA + UVM - see the example submission script for +details. Compare the performance you get for the CPU to the GPU for a +range of problem sizes. + +You can plot results with the plotting notebook explained above. + +# 3. Use Mirror Views + +The corresponding source code is in +`APT-CPP/exercies/kokkos/3/exercise_3_begin.cpp`. + +Here we add `HostMirror` instances to manage transfer of data between +host and GPU memories. Search the source for comments saying +`EXERCISE` to see where to change things. + +Note that we are no longer forcing CUDA to use Unified Memory. + +Compile with make and run the code on a GPU node - see `submit.sh` for +example use. Again, compare performance between CPU and GPU for a +range of problem sizes. + + +# 4. Control the Layout + +The corresponding source code is in +`APT-CPP/exercies/kokkos/4/exercise_4_begin.cpp`. + +Here you will control the layout of the data and experiment with the +combinations of the execution space and data layout. Search the +source for comments saying `EXERCISE` to see where to change things. + +Compile for a variety of memory layouts and execution spaces and run +across a range of problem sizes. Plot the results and try to +understand the variation. diff --git a/exercises/kokkos/instructions.pdf b/exercises/kokkos/instructions.pdf new file mode 100644 index 0000000..d55deb5 Binary files /dev/null and b/exercises/kokkos/instructions.pdf differ diff --git a/exercises/kokkos/plot.ipynb b/exercises/kokkos/plot.ipynb new file mode 100644 index 0000000..2fd7220 --- /dev/null +++ b/exercises/kokkos/plot.ipynb @@ -0,0 +1,102 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "import io\n", + "%matplotlib inline\n", + "\n", + "def scatterplot(data, x, y, **kwargs):\n", + " # Cirrus has an old version of Seaborn - work around lack of sns.scatterplot\n", + " if sns.__version__ < '0.9':\n", + " grid = sns.FacetGrid(data, **kwargs)\n", + " grid.map(plt.scatter, x, y).add_legend()\n", + " return grid\n", + " return sns.scatterplot(data=data, x=x, y=y, **kwargs)\n", + "\n", + "\n", + "def munge_csv_datasets(*dsets):\n", + " ans = []\n", + " for raw in dsets:\n", + " new = pd.read_csv(io.StringIO(raw), sep=', *',engine='python')\n", + " new['nrows'] = 2**new['N']\n", + " new['ncols'] = 2**new['M']\n", + " new.drop(columns=['N', 'M'])\n", + " ans.append(new)\n", + " return pd.concat(ans)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Example data - replace with your measurements!\n", + "# CPU results\n", + "raw_cpu = '''exec_space, layout, cores, N, M, bandwidth\n", + "OpenMP, default, 10, 8, 14, 45.981\n", + "OpenMP, default, 10, 10, 12, 45.9073\n", + "OpenMP, default, 10, 12, 10, 45.8484\n", + "OpenMP, default, 10, 14, 8, 45.8484\n", + "OpenMP, default, 10, 16, 6, 43.5089\n", + "'''\n", + "# GPU results. NB: 84 SMs per GV100\n", + "raw_gpu = '''exec_space,layout, cores, N, M, bandwidth\n", + "CudaUVM, default, 84, 8, 14, 10.9454\n", + "CudaUVM, default, 84, 10, 12, 37.5838\n", + "CudaUVM, default, 84, 12, 10, 101.408\n", + "CudaUVM, default, 84, 14, 8, 179.82\n", + "CudaUVM, default, 84, 16, 6, 214.254\n", + "'''\n", + "\n", + "data = munge_csv_datasets(raw_cpu, raw_gpu)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = scatterplot(data, 'nrows', 'bandwidth', col='exec_space', hue='layout')\n", + "fig.set(xscale='log')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/exercises/kokkos/run_nb_server.sh b/exercises/kokkos/run_nb_server.sh new file mode 100755 index 0000000..7f5eeec --- /dev/null +++ b/exercises/kokkos/run_nb_server.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# +# Usage: ./run_nb_server PORT PASSWORD + +module load anaconda/python3 + +echo "Create a tunnel over ssh from your workstation to Cirrus with:" +echo "ssh -NL 8888:0.0.0.0:$1 $USER@login.cirrus.ac.uk" +echo "Then point your browser at http://localhost:8888/notebooks/plot.ipynb" + +jupyter-notebook --no-browser --ip=0.0.0.0 --port $1 --NotebookApp.token=$2 + diff --git a/exercises/morton-order/Makefile b/exercises/morton-order/Makefile new file mode 100644 index 0000000..9d05269 --- /dev/null +++ b/exercises/morton-order/Makefile @@ -0,0 +1,7 @@ +include config.mk +exes = test_bits + +all : $(exes) + +clean : + -rm -f *.o $(exes) diff --git a/exercises/morton-order/README.md b/exercises/morton-order/README.md new file mode 100644 index 0000000..f8a502d --- /dev/null +++ b/exercises/morton-order/README.md @@ -0,0 +1,3 @@ +This document is available in multiple formats: +* [PDF](instructions.pdf) +* [Markdown](instructions.md) diff --git a/exercises/morton-order/bits.hpp b/exercises/morton-order/bits.hpp new file mode 100644 index 0000000..9382028 --- /dev/null +++ b/exercises/morton-order/bits.hpp @@ -0,0 +1,71 @@ +#ifndef MORTON_BITS_HPP +#define MORTON_BITS_HPP +#include + +namespace morton { + // Go from bit pattern like + // abcd + // to: + // 0a0b0c0d + inline uint64_t split(const uint32_t a) { + uint64_t x = a; + x = (x | x << 16) & 0x0000ffff0000ffffUL; + x = (x | x << 8) & 0x00ff00ff00ff00ffUL; + x = (x | x << 4) & 0x0f0f0f0f0f0f0f0fUL; + x = (x | x << 2) & 0x3333333333333333UL; + x = (x | x << 1) & 0x5555555555555555UL; + return x; + } + + // Reverse the above + inline uint32_t pack(const uint64_t z) { + uint64_t x = z; + x &= 0x5555555555555555UL; + x = x >> 1 | x; + x &= 0x3333333333333333UL; + x = x >> 2 | x; + x &= 0x0f0f0f0f0f0f0f0fUL; + x = x >> 4 | x; + x &= 0x00ff00ff00ff00ffUL; + x = x >> 8 | x; + x &= 0x0000ffff0000ffffUL; + x = x >> 16| x; + return x; + } + + // Compute the 2d Morton code for a pair of indices + inline uint64_t encode(const uint32_t x, const uint32_t y) { + return split(x) | split(y) << 1; + } + + // Compute the 2 indices from a Morton index + inline void decode(const uint64_t z, uint32_t& x, uint32_t& y) { + uint64_t i = z; + x = pack(i); + uint64_t j = z >> 1; + y = pack(j); + } + + const uint64_t odd_bit_mask = 0x5555555555555555UL; + const uint64_t even_bit_mask = 0xaaaaaaaaaaaaaaaaUL; + + // Move from (i, j) -> (i - 1, j) + inline uint64_t dec_x(const uint64_t z) { + return (((z & odd_bit_mask) - 1) & odd_bit_mask) | (z & even_bit_mask); + } + // Move from (i, j) -> (i + 1, j) + inline uint64_t inc_x(const uint64_t z) { + return (((z | even_bit_mask) + 1) & odd_bit_mask) | (z & even_bit_mask); + } + + // Move from (i, j) -> (i, j - 1) + inline uint64_t dec_y(const uint64_t z) { + return (z & odd_bit_mask) | (((z & even_bit_mask) - 1) & even_bit_mask); + } + // Move from (i, j) -> (i, j + 1) + inline uint64_t inc_y(const uint64_t z) { + return (z & odd_bit_mask) | (((z | odd_bit_mask) + 1) & even_bit_mask); + } + +} +#endif diff --git a/exercises/morton-order/config.mk b/exercises/morton-order/config.mk new file mode 100644 index 0000000..d7be2c8 --- /dev/null +++ b/exercises/morton-order/config.mk @@ -0,0 +1,2 @@ +CXXFLAGS = -g --std=c++11 -I.. +CC = $(CXX) diff --git a/exercises/morton-order/instructions.md b/exercises/morton-order/instructions.md new file mode 100644 index 0000000..dcbbed9 --- /dev/null +++ b/exercises/morton-order/instructions.md @@ -0,0 +1,138 @@ +# Morton order matrices in C++ +## Rupert Nash +## r.nash@epcc.ed.ac.uk + +Source for this can be obtained from Github. Get a new copy with: + +``` +git clone https://github.com/EPCCed/APT-CPP +``` + +or update your existing one with + +``` +git pull +``` + +then you can + +``` +cd APT-CPP/exercises/morton-order +``` + +The Morton ordering (or z-ordering) of a matrix lays out the elements +along a recursive z-shaped curve, as shown in the figure of four +iterations of the Z-order curve (from +[Wikipedia](https://en.wikipedia.org/wiki/Z-order_curve)). + +![Morton order](mortonorder.png) + +You can compute the Morton index `z` from the x- and y-indices (`i` +and `j` respectively) by interleaving their bits. An example is shown +in the table. + +| | 0 | 1 | 2 | 3 | +|----|----|----|----|----| +| 0 | 0| 1| 4| 5| +| 1 | 2| 3| 6| 7| +| 2 | 8| 9| 12| 13| +| 3 | 10| 11| 14| 15| + +Mapping between `x-y` indexes and Morton index for a 4 by 4 +matrix. Decimal on the left and binary on the right. + +| | 00 | 01 | 10 | 11 | +|----|------|------|------|-----| +| 00 | 0000 | 0001 | 0100 | 0101| +| 01 | 0010 | 0011 | 0110 | 0111| +| 10 | 1000 | 1001 | 1100 | 1101| +| 11 | 1010 | 1011 | 1110 | 1111| + +Mapping between `x-y` indexes and Morton index for a matrix of size +4-by-4. Decimal on the left and binary on the right. + +The advantage of laying out data in this way is that it improves data +locality (and hence cache use) without having to tune a block size or +similar parameter. On a modern multilevel cache machine[^1], this +means it can take good advantage of all the levels without tuning +multiple parameters. + +(E.g. an ARCHER node has L1, L2, and L3 caches, and the RAM is divided +into two NUMA regions. If using a PGAS approach one can view local RAM +as a cache for the distributed memory - i.e. 6 levels!) + +This exercise will walk you through a simple implementation. + +I have included implementations of the functions that do the +"bit-twiddling" for translating between a two-dimensional `x-y` index +and the Morton index, in the file `bits.hpp`. These are reasonably fast, +but can be beaten if you are interested to try! + +In what follows each section corresponds to a subdirectory with the same +number. + +## Implement the underlying data storage and element access + +Go to the step 1 directory: + +```bash +cd APT-CPP/exercises/morton-order/step1 +``` + +Using the partial implemenation in `matrix.hpp`, your task is to +implement the allocation (and release!) of memory to store the data and +to use the helper functions from `bits.hpp` to allow element access. You +will need to implement a number of member functions (marked in the +source with `\\ TODO`) and add whatever data members are needed (marked in +the same way). + +There is a test program `test_matrix_basic.cpp` which runs a few sanity +checks on your implementation (and similarly with `test_bits.cpp`). The +supplied `Makefile` should work. + +## Implement a basic iterator to traverse the matrix in order + +Go to the step 2 directory: + +```bash +cd APT-CPP/exercises/morton-order/step2 +``` + +I have a potential solution to part 1 here, but feel free to copy your +implementation into this. + +The exercise here is to complete the `matrix_iterator` class +that I have started. I've provided most of the boilerplate to have +this work as a "bidirectional iterator". See +http://en.cppreference.com/w/cpp/concept/BidirectionalIterator for +full details of what this means, but basically it's one that can move +forward and backward through the data. + +Again, the things that need added are marked with `\\TODO`. The most +important thing to think about is how you will refer to the current +position and be able to traverse through it efficiently in Morton +order - the performance should be identical to looping over a raw +pointer! + +## Adapt your code to work with any datatype using templates + +This part and the next to be attempted after the lecture on templates! + +In the step 3 directory, and using your solution from part 2, refactor +the `matrix` class into a template class with a single parameter, the +type of the contained element. Remember that template definitions have +to be available to the compiler when they are used by client code! + +Places you will have to modify the source are marked with `TODO`. + +## Allowing iteration over const matrices + +The current implementation of `matrix_iterator` does not work if the +`matrix` being used is `const`. We could implement a new class +template `matrix_const_iterator` which would be a copy and paste of +`matrix_iterator` but with lots of `const` added. + +In the step 4 directory, adapting the `matrix_iterator` template to +work for `T` and `T const` and use this appropriately in the `matrix` +template. Places you will have to modify the source are marked with +`TODO`. diff --git a/exercises/morton-order/instructions.pdf b/exercises/morton-order/instructions.pdf new file mode 100644 index 0000000..575c06b Binary files /dev/null and b/exercises/morton-order/instructions.pdf differ diff --git a/exercises/morton-order/mortonorder.png b/exercises/morton-order/mortonorder.png new file mode 100644 index 0000000..b36cf29 Binary files /dev/null and b/exercises/morton-order/mortonorder.png differ diff --git a/exercises/morton-order/range.hpp b/exercises/morton-order/range.hpp new file mode 100644 index 0000000..edf43fb --- /dev/null +++ b/exercises/morton-order/range.hpp @@ -0,0 +1,133 @@ +/* range.hpp + * + * Copyright (c) 2013 Alexander Duchene + * + * This piece of software was created as part of the Drosophila Population + * Genomics Project opensource agreement. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RANGE_ITERATOR_HPP +#define RANGE_ITERATOR_HPP +#include + +namespace rangepp { + + template + class range_impl{ + private: + value_t rbegin; + value_t rend; + value_t step; + int step_end; + public: + range_impl(value_t begin, value_t end, value_t step=1): + rbegin(begin),rend(end),step(step){ + step_end=(rend-rbegin)/step; + if(rbegin+step_end*step != rend){ + step_end++; + } + } + + class iterator: + public std::iterator + { + private: + value_t current_value; + int current_step; + range_impl& parent; + public: + iterator(int start,range_impl& parent): current_step(start), parent(parent){current_value=parent.rbegin+current_step*parent.step;} + value_t operator*() {return current_value;} + const iterator* operator++(){ + current_value+=parent.step; + current_step++; + return this; + } + const iterator* operator++(int){ + current_value+=parent.step; + current_step++; + return this; + } + bool operator==(const iterator& other) { + return current_step==other.current_step; + } + bool operator!=(const iterator& other) { + return current_step!=other.current_step; + } + iterator operator+(int s) { + iterator ret=*this; + ret.current_step+=s; + ret.current_value+=s*parent.step; + return ret; + } + iterator operator-(int s){ + iterator ret=*this; + ret.current_step-=s; + ret.current_value-=s*parent.step; + return ret; + } + const iterator* operator--(){ + current_value-=parent.step; + current_step--; + return this;} + iterator operator--(int){ + iterator old=*this; + current_value-=parent.step; + current_step--; + return old; + } + }; + + iterator begin(){ + return iterator(0,*this); + } + iterator end(){ + return iterator(step_end,*this); + } + + value_t operator[](int s){ + return rbegin+s*step; + } + + int size(){ + return step_end; + } + }; +} +template +auto range(other begin, other end, vt stepsize)->rangepp::range_impl +{ + + return rangepp::range_impl(begin,end,stepsize); +} + +template +auto range(b begin, e end) -> rangepp::range_impl +{ + return rangepp::range_impl(begin,end,1); +} + +template +rangepp::range_impl range(e end){ + return rangepp::range_impl(0,end,1); +} + +#endif diff --git a/exercises/morton-order/step1/Makefile b/exercises/morton-order/step1/Makefile new file mode 100644 index 0000000..290dfeb --- /dev/null +++ b/exercises/morton-order/step1/Makefile @@ -0,0 +1,12 @@ +include ../config.mk + +exes = test_matrix_base + +all : $(exes) + +test_matrix_base : test_matrix_base.o matrix.o + +%.o : matrix.hpp + +clean : + -rm -f *.o $(exes) diff --git a/exercises/morton-order/step1/matrix.cpp b/exercises/morton-order/step1/matrix.cpp new file mode 100644 index 0000000..923c723 --- /dev/null +++ b/exercises/morton-order/step1/matrix.cpp @@ -0,0 +1,55 @@ +#include "matrix.hpp" + +#include + +namespace morton { + + // TODO - allocate some memory + matrix::matrix(uint32_t r) + { + // Check it's a power of 2. Could consider throwing an + // exception, but these are not in the syllabus! + assert((r & (r-1)) == 0); + } + + // Moving is allowed + // TODO - if you need to provide these, do so here + matrix::matrix(matrix&& other) noexcept {} + matrix& matrix::operator=(matrix&& other) noexcept {} + + // Destructor + // TODO - will the default implemenation be OK? + matrix::~matrix() {} + + // Create a new matrix with contents copied from this one + matrix matrix::duplicate() const { + // TODO + } + + // Get rank size + uint32_t matrix::rank() const { + return _rank; + } + + // Get total size + uint64_t matrix::size() const { + return uint64_t(_rank) * uint64_t(_rank); + } + + // TODO + // Const element access + const int& matrix::operator()(uint32_t i, uint32_t j) const { + } + + // TODO + // Mutable element access + int& matrix::operator()(uint32_t i, uint32_t j) { + } + + // TODO + // Raw data access (const and mutable versions) + const int* matrix::data() const { + } + int* matrix::data() { +} + diff --git a/exercises/morton-order/step1/matrix.hpp b/exercises/morton-order/step1/matrix.hpp new file mode 100644 index 0000000..a29df6a --- /dev/null +++ b/exercises/morton-order/step1/matrix.hpp @@ -0,0 +1,65 @@ +#ifndef MORTON_MATRIX_HPP +#define MORTON_MATRIX_HPP + +#include +#include "bits.hpp" + +namespace morton { + + // 2D square matrix of ints that stores data in Morton order + // + // NB: + // + // - This simple implementation requires that the size be a power + // of 2 (or zero indicating an empty matrix) + // + // - The matrix does not need to be resizeable + // + // - The matrix must not be implicitly copiable, must use explicit + // duplicate member function + class matrix { + private: + // rank of matrix + uint32_t _rank; + // Data storage + // TODO - choose how to store data and manage that memory + + public: + // TODO - anything needed? + matrix(); + + // Construct with appropriate memory for an r by r matrix. + matrix(uint32_t r); + + // Implicit copying is not allowed + // TODO: do we need to do anything to prevent this? + + // Moving is allowed + // TODO - will the default implementations be OK? + matrix(matrix&& other) noexcept; + matrix& operator=(matrix&& other) noexcept; + + // Destructor + // TODO - will the default implemenation be OK? + ~matrix(); + + // Create a new matrix with contents copied from this one + matrix duplicate() const; + + // Get rank size + uint32_t rank() const; + + // Get total size + uint64_t size() const; + + // Const element access + const int& operator()(uint32_t i, uint32_t j) const; + // Mutable element access + int& operator()(uint32_t i, uint32_t j); + + // Raw data access (const and mutable versions) + const int* data() const; + int* data(); + }; +} +#endif diff --git a/exercises/morton-order/step1/test_matrix_base.cpp b/exercises/morton-order/step1/test_matrix_base.cpp new file mode 100644 index 0000000..b998684 --- /dev/null +++ b/exercises/morton-order/step1/test_matrix_base.cpp @@ -0,0 +1,112 @@ +#include + +#include "matrix.hpp" +#include "test.hpp" +#include "range.hpp" + +bool test_small() { + const int N = 4; + morton::matrix small{N}; + + // Fill with standard C array layout 1D index + for (auto i: range(N)) + for (auto j: range(N)) + small(i, j) = i*N + j; + + // Matrix contains: + // 0 4 8 12 + // 1 5 9 13 + // 2 6 10 14 + // 3 7 11 15 + + auto data = small.data(); + const std::vector expected = { + 0, 4, 1, 5, 8, 12, 9, 13, + 2, 6, 3, 7,10, 14,11, 15 + }; + for (auto z : range(N*N)) { + TEST_ASSERT_EQUAL(expected[z], data[z]); + } + return true; +} + +// Helper for making a matrix filled down the diagonal. +// This touches plenty of the pages to ensure mem is actually allocated. +morton::matrix make_diag(uint32_t rank) { + auto mat = morton::matrix{rank}; + // fill diagonal + for (auto i: range(rank)) + mat(i,i) = i; + return mat; +} + +bool test_large() { + const int logN = 10; + const int N = 1 << logN; + auto mat = make_diag(N); + + auto data = mat.data(); + uint64_t z = 0; + // pretty easy to convince yourself that the "last" index in each + // successively bigger quad (starting at the origin) is (n^2 - 1) + // where n is the linear size of that. + + // So in the below we're talking about the values 0, 3, 15 + + // 0 1 4 5 + // 2 3 6 7 + // 8 9 12 13 + //10 11 14 15 + + + for (auto i: range(logN+1)) { + auto n = 1 << i; + auto z = n*n - 1; + TEST_ASSERT_EQUAL(n - 1, data[z]); + } + + return true; +} + +bool test_move() { + auto m1 = make_diag(4); + auto m2 = make_diag(8); + + m2 = std::move(m1); + // m1 is now moved-from: we can't do anything except destroy it or + // assign a new value + TEST_ASSERT_EQUAL(4, m2.rank()); + + // Test return of operator== + const morton::matrix& matref = (m1 = std::move(m2)); + // Also test the const element access version of operator() + for (auto i: range(4)) + for (auto j: range(4)) + TEST_ASSERT_EQUAL(matref(i,j), m1(i,j)); + + + return true; +} + +// Try to ensure we really are deleting used memory +bool test_free() { + const int logN = 10; + const int N = 1 << logN; + for (auto j: range(10000)) + auto mat = make_diag(N); + + return true; +} + + +int main() { + static_assert(!std::is_copy_constructible::value, + "Require that morton matrix is not copyable"); + static_assert(std::is_move_constructible::value, + "Require that morton matrix is moveable"); + RUN_TEST(test_small); + RUN_TEST(test_large); + RUN_TEST(test_move); + RUN_TEST(test_free); + return 0; +} diff --git a/exercises/morton-order/step2/Makefile b/exercises/morton-order/step2/Makefile new file mode 100644 index 0000000..c8d431f --- /dev/null +++ b/exercises/morton-order/step2/Makefile @@ -0,0 +1,13 @@ +include ../config.mk +exes = test_matrix_base test_matrix_iter + +all : $(exes) + +test_matrix_base : test_matrix_base.o matrix.o + +test_matrix_iter : test_matrix_iter.o matrix.o + +%.o : matrix.hpp + +clean : + -rm -f *.o $(exes) diff --git a/exercises/morton-order/step2/matrix.cpp b/exercises/morton-order/step2/matrix.cpp new file mode 100644 index 0000000..ea2f262 --- /dev/null +++ b/exercises/morton-order/step2/matrix.cpp @@ -0,0 +1,94 @@ +#include "matrix.hpp" + +#include + +namespace morton { + + matrix::matrix(uint32_t r) + : _rank(r), _data(new int[r*r]) + { + // Check it's a power of 2. Could consider throwing an + // exception, but these are not in the syllabus! + assert((r & (r-1)) == 0); + } + + // Create a new matrix with contents copied from this one + matrix matrix::duplicate() const { + matrix ans(_rank); + std::copy(data(), data() + size(), ans.data()); + return ans; + } + + // Get rank size + uint32_t matrix::rank() const { + return _rank; + } + + // Get total size + uint64_t matrix::size() const { + return uint64_t(_rank) * uint64_t(_rank); + } + + // Const element access + const int& matrix::operator()(uint32_t i, uint32_t j) const { + auto z = encode(i, j); + return _data[z]; + } + + // Mutable element access + int& matrix::operator()(uint32_t i, uint32_t j) { + auto z = encode(i, j); + return _data[z]; + } + + // Raw data access (const and mutable versions) + const int* matrix::data() const { + return _data.get(); + } + int* matrix::data() { + return _data.get(); + } + + // Mutable iterators + // TODO: implement functions to get iterators to first and + // just-past-the-last elements in the matrix + // NB: use trailing return type to have access to names defined + // inside the class namespace. + auto matrix::begin() -> iterator { + } + auto matrix::end() -> iterator { + } + + // TODO: implement constructors. + + // TODO + // Get the x/y coordinates of the current element + uint32_t matrix_iterator::x() const { + } + uint32_t matrix_iterator::y() const { + } + + // Dereference operator + auto matrix_iterator::operator*() const -> reference { + // TODO + } + + // Preincrement operator + matrix_iterator& matrix_iterator::operator++() { + // TODO + } + // TODO + // Predecrement operator + matrix_iterator& matrix_iterator::operator--() { + // TODO + } + // Comparison operators. + bool operator==(const matrix_iterator& a, const matrix_iterator& b) { + // TODO + } + // Note this can be done in terms of the above + bool operator!=(const matrix_iterator& a, const matrix_iterator& b) { + return !(a == b); + } +} + diff --git a/exercises/morton-order/step2/matrix.hpp b/exercises/morton-order/step2/matrix.hpp new file mode 100644 index 0000000..0154bb8 --- /dev/null +++ b/exercises/morton-order/step2/matrix.hpp @@ -0,0 +1,130 @@ +#ifndef MORTON_MATRIX_HPP +#define MORTON_MATRIX_HPP + +#include +#include +#include +#include "bits.hpp" + +namespace morton { + // Forward declare the iterator + class matrix_iterator; + + // 2D square matrix of ints that stores data in Morton order + // + // NB: + // + // - This simple implementation requires that the size be a power + // of 2 (or zero indicating an empty matrix) + // + // - The matrix does not need to be resizeable + // + // - The matrix must not be implicitly copiable, must use explicit + // duplicate member function + class matrix { + public: + using iterator = matrix_iterator; + + private: + // rank of matrix + uint32_t _rank; + // Data storage + // Note using array version of unique_ptr + std::unique_ptr _data; + + public: + // Default initialisation of unique_ptr is OK + matrix() = default; + + // Construct with appropriate memory for an r by r matrix. + matrix(uint32_t r); + + // Implicit copying is not allowed + // Use of unique_ptr prevents copying. + + // Moving is allowed + // Default is ok because of choice to use unique_ptr to manage data storage + + // Destructor + // Default ok because of unique_ptr + + // Create a new matrix with contents copied from this one + matrix duplicate() const; + + // Get rank size + uint32_t rank() const; + + // Get total size + uint64_t size() const; + + // Const element access + const int& operator()(uint32_t i, uint32_t j) const; + // Mutable element access + int& operator()(uint32_t i, uint32_t j); + + // Raw data access (const and mutable versions) + const int* data() const; + int* data(); + + // Mutable iterators to beginning and just-past-the-end + iterator begin(); + iterator end(); + }; + + // Iterator type for matrices + class matrix_iterator { + public: + // Here we exposed the member types required by the standard + // library's concept of an iterator. See: + // https://en.cppreference.com/w/cpp/iterator/iterator_traits + using value_type = int; + using difference_type = int64_t; + using reference = int&; + using pointer = int*; + // This means we can go forwards and backwards. + // Upgrading to a random access iterator would be eay but require + // quite a lot of additional operations. + using iterator_category = std::bidirectional_iterator_tag; + + private: + // What members are required? + + public: + // Default constructor + // TODO: is this needed? + matrix_iterator(); + + // Note: must provide copy c'tor, copy assign + // TODO: Decide if the default copy/move/destruct behaviour is + // going to be OK. + + // Get the x/y coordinates of the current element + uint32_t x() const; + uint32_t y() const; + + // Comparison operators. Note these are inline non-member friend + // functions. + friend bool operator==(const matrix_iterator& a, const matrix_iterator& b); + // Note this can be done in terms of the above + friend bool operator!=(const matrix_iterator& a, const matrix_iterator& b); + + // Dereference operator + reference operator*() const; + + // Preincrement operator + matrix_iterator& operator++(); + // Predecrement operator + matrix_iterator& operator--(); + + private: + // TODO: declare and define appropriate constructor(s) to create + // iterators pointing into a matrix's data. + // matrix_iterator(...); + + // Other constructors should probably not be publicly visible, so + // we need to allow matrix access. + friend matrix; + }; + +} +#endif diff --git a/exercises/morton-order/step2/test_matrix_base.cpp b/exercises/morton-order/step2/test_matrix_base.cpp new file mode 120000 index 0000000..df4b3fc --- /dev/null +++ b/exercises/morton-order/step2/test_matrix_base.cpp @@ -0,0 +1 @@ +../step1/test_matrix_base.cpp \ No newline at end of file diff --git a/exercises/morton-order/step2/test_matrix_iter.cpp b/exercises/morton-order/step2/test_matrix_iter.cpp new file mode 100644 index 0000000..3825e9f --- /dev/null +++ b/exercises/morton-order/step2/test_matrix_iter.cpp @@ -0,0 +1,79 @@ +#include + +#include "matrix.hpp" +#include "test.hpp" +#include "range.hpp" + +morton::matrix make_filled(int N) { + morton::matrix mat(N); + + // Fill with standard C array layout 1D index + for (auto i: range(N)) + for (auto j: range(N)) + mat(i, j) = i*N + j; + return mat; +} + +// M = matrix or const matrix +template +bool check_mat(M& mat) { + // Matrix contains: + // 0 4 8 12 + // 1 5 9 13 + // 2 6 10 14 + // 3 7 11 15 + + // This will store the count of elements visited before the current + // and so should be the Morton index of the element. + int z = 0; + auto it = mat.begin(); + for (; it != mat.end(); ++it, ++z) { + uint32_t i, j; + morton::decode(z, i, j); + int expect = i*N + j; + TEST_ASSERT_EQUAL(expect, *it); + + TEST_ASSERT_EQUAL(i, it.x()); + TEST_ASSERT_EQUAL(j, it.y()); + } + TEST_ASSERT_EQUAL(N*N, z); + + return true; +} + +bool test_mut_iter() { + constexpr int N = 4; + auto mat = make_filled(N); + return check_mat(mat); +} + +bool test_rev_iter() { + const int N = 4; + auto mat = make_filled(N); + + int z = N*N; + auto it = mat.end(); + for (; it != mat.begin();) { + // We want to run the below for it == mat.begin() and then finish + // so move decrement inside the body. + --it; + --z; + + uint32_t i, j; + morton::decode(z, i, j); + + int expect = i*N + j; + TEST_ASSERT_EQUAL(expect, *it); + + TEST_ASSERT_EQUAL(i, it.x()); + TEST_ASSERT_EQUAL(j, it.y()); + } + TEST_ASSERT_EQUAL(0, z); + return true; +} + +int main() { + RUN_TEST(test_mut_iter); + RUN_TEST(test_rev_iter); + return 0; +} diff --git a/exercises/morton-order/step3/Makefile b/exercises/morton-order/step3/Makefile new file mode 100644 index 0000000..3d92d61 --- /dev/null +++ b/exercises/morton-order/step3/Makefile @@ -0,0 +1,13 @@ +include ../config.mk +exes = test_matrix_base test_matrix_iter + +all : $(exes) + +test_matrix_base : test_matrix_base.o + +test_matrix_iter : test_matrix_iter.o + +%.o : matrix.hpp + +clean : + -rm -f *.o $(exes) diff --git a/exercises/morton-order/step3/matrix.hpp b/exercises/morton-order/step3/matrix.hpp new file mode 100644 index 0000000..6665c46 --- /dev/null +++ b/exercises/morton-order/step3/matrix.hpp @@ -0,0 +1,129 @@ +#ifndef MORTON_MATRIX_HPP +#define MORTON_MATRIX_HPP + +#include +#include +#include +#include "bits.hpp" + +namespace morton { + // Forward declare the iterator + // TODO: any changes? + class matrix_iterator; + + // 2D square matrix of ints that stores data in Morton order + // + // NB: + // + // - This simple implementation requires that the size be a power + // of 2 (or zero indicating an empty matrix) + // + // - The matrix does not need to be resizeable + // + // - The matrix must not be implicitly copiable, must use explicit + // duplicate member function + // TODO: convert to a template class + class matrix { + public: + //TODO: it is common practice to make a type alias to the element + // type under the name `value_type`. + //TODO: any changes? + using iterator = matrix_iterator; + + private: + // rank of matrix + uint32_t _rank; + // Data storage + // Note using array version of unique_ptr + std::unique_ptr _data; + + public: + // Default initialisation of unique_ptr is OK + matrix() = default; + + // Construct with appropriate memory for an r by r matrix. + matrix(uint32_t r); + + // Create a new matrix with contents copied from this one + matrix duplicate() const; + } + + // Get rank size + uint32_t rank() const; + + // Get total size + uint64_t size() const; + + // Const element access + const value_type& operator()(uint32_t i, uint32_t j) const; + + // Mutable element access + value_type& operator()(uint32_t i, uint32_t j); + + + // Raw data access (const and mutable versions) + const value_type* data() const; + value_type* data(); + + // Mutable iterators + iterator begin(); + iterator end(); + }; + + // Iterator type for matrices + template + class matrix_iterator { + public: + // Here we exposed the member types required by the standard + // library's concept of an iterator. See: + // https://en.cppreference.com/w/cpp/iterator/iterator_traits + // TODO: update for template case + using value_type = int; + using difference_type = int64_t; + using reference = int&; + using pointer = int*; + // This means we can go forwards and backwards. + // Upgrading to a random access iterator would be eay but require + // quite a lot of additional operations. + using iterator_category = std::bidirectional_iterator_tag; + + private: + // We need the pointer to the first element to work out where we + // are in the matrix for x/y calculation. + pointer _start = nullptr; + pointer _ptr = nullptr; + + public: + // Default constructor + matrix_iterator() = default; + + // Note: must provide copy c'tor, copy assign + // Defaults are fine because this does not own data + + // Get the x/y coordinates of the current element + uint32_t x() const; + uint32_t y() const; + + // Comparison operators. + friend bool operator==(const matrix_iterator& a, const matrix_iterator& b); + friend bool operator!=(const matrix_iterator& a, const matrix_iterator& b); + + // Dereference operator + reference operator*() const; + + // Preincrement operator + matrix_iterator& operator++(); + // Predecrement operator + matrix_iterator& operator--(); + + private: + // Private constructor + matrix_iterator(pointer start, pointer current); + + // Other constructors should probably not be publicly visible, so + // we need to allow matrix access. + friend matrix; + }; +} + +#endif diff --git a/exercises/morton-order/step3/test_matrix_base.cpp b/exercises/morton-order/step3/test_matrix_base.cpp new file mode 100644 index 0000000..e7c1be0 --- /dev/null +++ b/exercises/morton-order/step3/test_matrix_base.cpp @@ -0,0 +1,113 @@ +#include + +#include "matrix.hpp" +#include "test.hpp" +#include "range.hpp" + +bool test_small() { + const int N = 4; + morton::matrix small{N}; + + // Fill with standard C array layout 1D index + for (auto i: range(N)) + for (auto j: range(N)) + small(i, j) = i*N + j; + + // Matrix contains: + // 0 4 8 12 + // 1 5 9 13 + // 2 6 10 14 + // 3 7 11 15 + + auto data = small.data(); + const std::vector expected = { + 0, 4, 1, 5, 8, 12, 9, 13, + 2, 6, 3, 7,10, 14,11, 15 + }; + for (auto z : range(N*N)) { + TEST_ASSERT_EQUAL(expected[z], data[z]); + } + return true; +} + +// Helper for making a matrix filled down the diagonal. +// This touches plenty of the pages to ensure mem is actually allocated. +template +morton::matrix make_diag(uint32_t rank) { + auto mat = morton::matrix{rank}; + // fill diagonal + for (auto i: range(rank)) + mat(i,i) = i; + return mat; +} + +bool test_large() { + const int logN = 10; + const int N = 1 << logN; + auto mat = make_diag(N); + + auto data = mat.data(); + uint64_t z = 0; + // pretty easy to convince yourself that the "last" index in each + // successively bigger quad (starting at the origin) is (n^2 - 1) + // where n is the linear size of that. + + // So in the below we're talking about the values 0, 3, 15 + + // 0 1 4 5 + // 2 3 6 7 + // 8 9 12 13 + //10 11 14 15 + + + for (auto i: range(logN+1)) { + auto n = 1 << i; + auto z = n*n - 1; + TEST_ASSERT_EQUAL(n - 1, data[z]); + } + + return true; +} + +bool test_move() { + auto m1 = make_diag(4); + auto m2 = make_diag(8); + + m2 = std::move(m1); + // m1 is now moved-from: we can't do anything except destroy it or + // assign a new value + TEST_ASSERT_EQUAL(4, m2.rank()); + + // Test return of operator== + const morton::matrix& matref = (m1 = std::move(m2)); + // Also test the const element access version of operator() + for (auto i: range(4)) + for (auto j: range(4)) + TEST_ASSERT_EQUAL(matref(i,j), m1(i,j)); + + + return true; +} + +// Try to ensure we really are deleting used memory +bool test_free() { + const int logN = 10; + const int N = 1 << logN; + for (auto j: range(10000)) + auto mat = make_diag(N); + + return true; +} + + +int main() { + static_assert(!std::is_copy_constructible>::value, + "Require that morton matrix is not copyable"); + static_assert(std::is_move_constructible>::value, + "Require that morton matrix is moveable"); + RUN_TEST(test_small); + RUN_TEST(test_large); + RUN_TEST(test_move); + RUN_TEST(test_free); + return 0; +} diff --git a/exercises/morton-order/step3/test_matrix_iter.cpp b/exercises/morton-order/step3/test_matrix_iter.cpp new file mode 100644 index 0000000..6666ebb --- /dev/null +++ b/exercises/morton-order/step3/test_matrix_iter.cpp @@ -0,0 +1,80 @@ +#include + +#include "matrix.hpp" +#include "test.hpp" +#include "range.hpp" + +template +morton::matrix make_filled(int N) { + morton::matrix mat(N); + + // Fill with standard C array layout 1D index + for (auto i: range(N)) + for (auto j: range(N)) + mat(i, j) = i*N + j; + return mat; +} + +// M = matrix or const matrix +template +bool check_mat(M& mat) { + // Matrix contains: + // 0 4 8 12 + // 1 5 9 13 + // 2 6 10 14 + // 3 7 11 15 + + // This will store the count of elements visited before the current + // and so should be the Morton index of the element. + int z = 0; + auto it = mat.begin(); + for (; it != mat.end(); ++it, ++z) { + uint32_t i, j; + morton::decode(z, i, j); + int expect = i*N + j; + TEST_ASSERT_EQUAL(expect, *it); + + TEST_ASSERT_EQUAL(i, it.x()); + TEST_ASSERT_EQUAL(j, it.y()); + } + TEST_ASSERT_EQUAL(N*N, z); + + return true; +} + +bool test_mut_iter() { + constexpr int N = 4; + auto mat = make_filled(N); + return check_mat(mat); +} + +bool test_rev_iter() { + const int N = 4; + auto mat = make_filled(N); + + int z = N*N; + auto it = mat.end(); + for (; it != mat.begin();) { + // We want to run the below for it == mat.begin() and then finish + // so move decrement inside the body. + --it; + --z; + + uint32_t i, j; + morton::decode(z, i, j); + + int expect = i*N + j; + TEST_ASSERT_EQUAL(expect, *it); + + TEST_ASSERT_EQUAL(i, it.x()); + TEST_ASSERT_EQUAL(j, it.y()); + } + TEST_ASSERT_EQUAL(0, z); + return true; +} + +int main() { + RUN_TEST(test_mut_iter); + RUN_TEST(test_rev_iter); + return 0; +} diff --git a/exercises/morton-order/step4/Makefile b/exercises/morton-order/step4/Makefile new file mode 100644 index 0000000..3d92d61 --- /dev/null +++ b/exercises/morton-order/step4/Makefile @@ -0,0 +1,13 @@ +include ../config.mk +exes = test_matrix_base test_matrix_iter + +all : $(exes) + +test_matrix_base : test_matrix_base.o + +test_matrix_iter : test_matrix_iter.o + +%.o : matrix.hpp + +clean : + -rm -f *.o $(exes) diff --git a/exercises/morton-order/step4/matrix.hpp b/exercises/morton-order/step4/matrix.hpp new file mode 100644 index 0000000..88e4c0c --- /dev/null +++ b/exercises/morton-order/step4/matrix.hpp @@ -0,0 +1,188 @@ +#ifndef MORTON_MATRIX_HPP +#define MORTON_MATRIX_HPP + +#include +#include +#include +#include +#include "bits.hpp" + +namespace morton { + // Forward declare the iterator + template + class matrix_iterator; + + // 2D square matrix of ints that stores data in Morton order + // + // NB: + // + // - This simple implementation requires that the size be a power + // of 2 (or zero indicating an empty matrix) + // + // - The matrix does not need to be resizeable + // + // - The matrix must not be implicitly copiable, must use explicit + // duplicate member function + template + class matrix { + public: + using value_type = T; + using iterator = matrix_iterator; + // TODO: using const_iterator = ?; + + private: + // rank of matrix + uint32_t _rank; + // Data storage + // Note using array version of unique_ptr + std::unique_ptr _data; + + public: + // Default initialisation of unique_ptr is OK + matrix() = default; + + // Construct with appropriate memory for an r by r matrix. + matrix(uint32_t r) : _rank(r), _data(new value_type[r*r]) { + // Check it's a power of 2. Could consider throwing an + // exception, but these are not in the syllabus! + assert((r & (r-1)) == 0); + } + + // Create a new matrix with contents copied from this one + matrix duplicate() const { + matrix ans(_rank); + std::copy(data(), data() + size(), ans.data()); + return ans; + } + + // Get rank size + uint32_t rank() const { + return _rank; + } + + // Get total size + uint64_t size() const { + return uint64_t(_rank) * uint64_t(_rank); + } + + // Const element access + const value_type& operator()(uint32_t i, uint32_t j) const { + auto z = encode(i, j); + return _data[z]; + } + + // Mutable element access + value_type& operator()(uint32_t i, uint32_t j) { + auto z = encode(i, j); + return _data[z]; + } + + + // Raw data access (const and mutable versions) + const value_type* data() const { + return _data.get(); + } + value_type* data() { + return _data.get(); + } + + // Mutable iterators + iterator begin() { + return iterator(data(), data()); + } + iterator end() { + return iterator(data(), data() + size()); + } + + // Mutable iterators + const_iterator begin() const { + return const_iterator(data(), data()); + } + const_iterator end() const { + return const_iterator(data(), data() + size()); + } + + }; + + // Iterator type for matrices + template + class matrix_iterator { + public: + // Here we exposed the member types required by the standard + // library's concept of an iterator. See: + // https://en.cppreference.com/w/cpp/iterator/iterator_traits + // + // TODO: will these be OK if T is a constant type? (e.g. `int const`) + using value_type = T; + using difference_type = int64_t; + using reference = T&; + using pointer = T*; + // This means we can go forwards and backwards. + // Upgrading to a random access iterator would be eay but require + // quite a lot of additional operations. + using iterator_category = std::bidirectional_iterator_tag; + + private: + // We need the pointer to the first element to work out where we + // are in the matrix for x/y calculation. + pointer _start = nullptr; + pointer _ptr = nullptr; + + public: + // Default constructor + matrix_iterator() = default; + + // Note: must provide copy c'tor, copy assign + // Defaults are fine because this does not own data + + // Get the x/y coordinates of the current element + uint32_t x() const { + auto z = _ptr - _start; + return pack(z); + } + uint32_t y() const { + auto z = _ptr - _start; + return pack(z >> 1); + } + + // Comparison operators. + friend bool operator==(const matrix_iterator& a, const matrix_iterator& b) { + return a._ptr == b._ptr; + } + friend bool operator!=(const matrix_iterator& a, const matrix_iterator& b) { + return !(a == b); + } + + // Dereference operator + reference operator*() const { + return *_ptr; + } + + // Preincrement operator + matrix_iterator& operator++() { + ++_ptr; + return *this; + } + // Predecrement operator + matrix_iterator& operator--() { + --_ptr; + return *this; + } + + private: + // Private constructor + matrix_iterator(pointer start, pointer current) + : _start(start), _ptr(current) { + } + + // Other constructors should probably not be publicly visible, so + // we need to allow matrix access. + // + // TODO: Suppose T = int const. How do get `int` from `int const`? + // See the standard type support library: + // https://en.cppreference.com/w/cpp/types + friend matrix; + }; +} + +#endif diff --git a/exercises/morton-order/step4/test_matrix_base.cpp b/exercises/morton-order/step4/test_matrix_base.cpp new file mode 100644 index 0000000..e7c1be0 --- /dev/null +++ b/exercises/morton-order/step4/test_matrix_base.cpp @@ -0,0 +1,113 @@ +#include + +#include "matrix.hpp" +#include "test.hpp" +#include "range.hpp" + +bool test_small() { + const int N = 4; + morton::matrix small{N}; + + // Fill with standard C array layout 1D index + for (auto i: range(N)) + for (auto j: range(N)) + small(i, j) = i*N + j; + + // Matrix contains: + // 0 4 8 12 + // 1 5 9 13 + // 2 6 10 14 + // 3 7 11 15 + + auto data = small.data(); + const std::vector expected = { + 0, 4, 1, 5, 8, 12, 9, 13, + 2, 6, 3, 7,10, 14,11, 15 + }; + for (auto z : range(N*N)) { + TEST_ASSERT_EQUAL(expected[z], data[z]); + } + return true; +} + +// Helper for making a matrix filled down the diagonal. +// This touches plenty of the pages to ensure mem is actually allocated. +template +morton::matrix make_diag(uint32_t rank) { + auto mat = morton::matrix{rank}; + // fill diagonal + for (auto i: range(rank)) + mat(i,i) = i; + return mat; +} + +bool test_large() { + const int logN = 10; + const int N = 1 << logN; + auto mat = make_diag(N); + + auto data = mat.data(); + uint64_t z = 0; + // pretty easy to convince yourself that the "last" index in each + // successively bigger quad (starting at the origin) is (n^2 - 1) + // where n is the linear size of that. + + // So in the below we're talking about the values 0, 3, 15 + + // 0 1 4 5 + // 2 3 6 7 + // 8 9 12 13 + //10 11 14 15 + + + for (auto i: range(logN+1)) { + auto n = 1 << i; + auto z = n*n - 1; + TEST_ASSERT_EQUAL(n - 1, data[z]); + } + + return true; +} + +bool test_move() { + auto m1 = make_diag(4); + auto m2 = make_diag(8); + + m2 = std::move(m1); + // m1 is now moved-from: we can't do anything except destroy it or + // assign a new value + TEST_ASSERT_EQUAL(4, m2.rank()); + + // Test return of operator== + const morton::matrix& matref = (m1 = std::move(m2)); + // Also test the const element access version of operator() + for (auto i: range(4)) + for (auto j: range(4)) + TEST_ASSERT_EQUAL(matref(i,j), m1(i,j)); + + + return true; +} + +// Try to ensure we really are deleting used memory +bool test_free() { + const int logN = 10; + const int N = 1 << logN; + for (auto j: range(10000)) + auto mat = make_diag(N); + + return true; +} + + +int main() { + static_assert(!std::is_copy_constructible>::value, + "Require that morton matrix is not copyable"); + static_assert(std::is_move_constructible>::value, + "Require that morton matrix is moveable"); + RUN_TEST(test_small); + RUN_TEST(test_large); + RUN_TEST(test_move); + RUN_TEST(test_free); + return 0; +} diff --git a/exercises/morton-order/step4/test_matrix_iter.cpp b/exercises/morton-order/step4/test_matrix_iter.cpp new file mode 100644 index 0000000..18cd680 --- /dev/null +++ b/exercises/morton-order/step4/test_matrix_iter.cpp @@ -0,0 +1,88 @@ +#include + +#include "matrix.hpp" +#include "test.hpp" +#include "range.hpp" + +template +morton::matrix make_filled(int N) { + morton::matrix mat(N); + + // Fill with standard C array layout 1D index + for (auto i: range(N)) + for (auto j: range(N)) + mat(i, j) = i*N + j; + return mat; +} + +// M = matrix or const matrix +template +bool check_mat(M& mat) { + // Matrix contains: + // 0 4 8 12 + // 1 5 9 13 + // 2 6 10 14 + // 3 7 11 15 + + // This will store the count of elements visited before the current + // and so should be the Morton index of the element. + int z = 0; + auto it = mat.begin(); + for (; it != mat.end(); ++it, ++z) { + uint32_t i, j; + morton::decode(z, i, j); + int expect = i*N + j; + TEST_ASSERT_EQUAL(expect, *it); + + TEST_ASSERT_EQUAL(i, it.x()); + TEST_ASSERT_EQUAL(j, it.y()); + } + TEST_ASSERT_EQUAL(N*N, z); + + return true; +} + +bool test_mut_iter() { + constexpr int N = 4; + auto mat = make_filled(N); + return check_mat(mat); +} + +bool test_rev_iter() { + const int N = 4; + auto mat = make_filled(N); + + int z = N*N; + auto it = mat.end(); + for (; it != mat.begin();) { + // We want to run the below for it == mat.begin() and then finish + // so move decrement inside the body. + --it; + --z; + + uint32_t i, j; + morton::decode(z, i, j); + + int expect = i*N + j; + TEST_ASSERT_EQUAL(expect, *it); + + TEST_ASSERT_EQUAL(i, it.x()); + TEST_ASSERT_EQUAL(j, it.y()); + } + TEST_ASSERT_EQUAL(0, z); + return true; +} + +// Basic test of const iterator +bool test_const_iter() { + const int N = 4; + const morton::matrix& mat = make_filled(N); + return check_mat(mat); +} + +int main() { + RUN_TEST(test_mut_iter); + RUN_TEST(test_rev_iter); + RUN_TEST(test_const_iter); + return 0; +} diff --git a/exercises/morton-order/test.hpp b/exercises/morton-order/test.hpp new file mode 100644 index 0000000..9cc3bc1 --- /dev/null +++ b/exercises/morton-order/test.hpp @@ -0,0 +1,23 @@ +#ifndef MORTON_TEST_HPP +#define MORTON_TEST_HPP + +#include + +// A few macros to help with basic unit testing + +// (Don't want to introduce any dependencies by using a framework) + +#define TEST_ASSERT_EQUAL(expected, actual) \ + if (expected != actual) { \ + std::cerr << "FAIL! Expected '" << expected \ + << "' Got '" << actual << "'" << std::endl; \ + return false; \ + } + +// Runs a nullary predicate as a test +#define RUN_TEST(tfunc) if (tfunc()) \ + std::cerr << #tfunc << ": ok" << std::endl; \ + else \ + std::cerr << #tfunc << ": FAILED" << std::endl; + +#endif diff --git a/exercises/morton-order/test_bits.cpp b/exercises/morton-order/test_bits.cpp new file mode 100644 index 0000000..b8a4080 --- /dev/null +++ b/exercises/morton-order/test_bits.cpp @@ -0,0 +1,92 @@ +#include +#include "bits.hpp" +#include "test.hpp" + +using namespace morton; + +using p32_64 = std::pair; +const std::vector pdata = { + {0x00000000U, 0x0000000000000000UL}, + {0x00000001U, 0x0000000000000001UL}, + {0x00000002U, 0x0000000000000004UL}, + {0x00000004U, 0x0000000000000010UL}, + {0x00000008U, 0x0000000000000040UL}, + {0x00000010U, 0x0000000000000100UL}, + + {0x0000000fU, 0x0000000000000055UL}, + + {0xffffffffU, 0x5555555555555555UL}, +}; + + +bool test_split() { + for(auto& item: pdata) { + auto res = split(item.first); + // All odd bits must be zero + // 0xa == 0b1010 + auto mask = 0xaaaaaaaaaaaaaaaaUL; + if (mask & res) { + std::cerr << "FAIL! Have a non-zero odd bit in " << res << std::endl; + return false; + } + + TEST_ASSERT_EQUAL(item.second, res); + + } + return true; +} + + +bool test_pack() { + + for(auto& item: pdata) { + auto res = pack(item.second); + TEST_ASSERT_EQUAL(item.first, res); + } + return true; +} + + +const std::vector> enc_data = { + {0, 0, 0}, + {1, 0, 1}, + {0, 1, 2}, + {1, 1, 3}, + + {42, 7, 1134}, + {0x45812369U, 0xa7112504U, 0x983b42030c271461UL}, + {0xffffffffU, 0xffffffffU, 0xffffffffffffffffUL} +}; + +bool test_encode() { + for (auto& item: enc_data) { + auto& x = std::get<0>(item); + auto& y = std::get<1>(item); + auto& z = std::get<2>(item); + + auto res = encode(x, y); + TEST_ASSERT_EQUAL(z, res); + + uint32_t rx, ry; + decode(z, rx, ry); + + TEST_ASSERT_EQUAL(x, rx); + TEST_ASSERT_EQUAL(y, ry); + } + return true; +} + +bool test_shift() { + uint64_t start = 0; + auto res = dec_y(dec_x(inc_y(inc_x(start)))); + TEST_ASSERT_EQUAL(start, res); + return true; + +} +int main() { + RUN_TEST(test_split); + RUN_TEST(test_pack); + RUN_TEST(test_encode); + RUN_TEST(test_shift); + return 0; +} diff --git a/exercises/threads/Makefile b/exercises/threads/Makefile new file mode 100644 index 0000000..60a3368 --- /dev/null +++ b/exercises/threads/Makefile @@ -0,0 +1,28 @@ +# Makefile for mandelbrot area code + +# +# C compiler and options for Intel +# +CC= icpc +CFLAGS = -fast -std=c++11 +LIB= -lm -lpthread + +# +# Object files +# +OBJ= area.o + +# +# Compile +# +area: $(OBJ) + $(CC) -o $@ $(OBJ) $(LIB) + +.cpp.o: + $(CC) $(CFLAGS) -c $< + +# +# Clean out object files and the executable. +# +clean: + rm *.o area diff --git a/exercises/threads/README.md b/exercises/threads/README.md new file mode 100644 index 0000000..f8a502d --- /dev/null +++ b/exercises/threads/README.md @@ -0,0 +1,3 @@ +This document is available in multiple formats: +* [PDF](instructions.pdf) +* [Markdown](instructions.md) diff --git a/exercises/threads/area.cpp b/exercises/threads/area.cpp new file mode 100644 index 0000000..898dd9a --- /dev/null +++ b/exercises/threads/area.cpp @@ -0,0 +1,49 @@ +#include +#include +#include + +using complex = std::complex; + +// Determine if a complex number is inside the set +bool in_mandelbrot(const complex& c) { + const auto MAXITER = 2000; + auto z = c; + for (auto i = 0; i < MAXITER; ++i) { + z = z*z + c; + if (std::norm(z) > 4.0) + return false; + } + return true; +} + +int main() { + const auto NPOINTS = 2000; + + const auto scale_real = 2.5; + const auto scale_imag = 1.125; + const auto eps = 1.0e-7; + const auto shift = complex{-2.0 + eps, 0.0 + eps}; + using clock = std::chrono::high_resolution_clock; + auto start = clock::now(); + + // Outer loops run over npoints, initialise z=c + // Inner loop has the iteration z=z*z+c, and threshold test + int num_inside = 0; + for (int i = 0; i < NPOINTS; ++i) { + for (int j = 0; j < NPOINTS; ++j) { + const auto c = shift + complex{(scale_real * i) / NPOINTS, + (scale_imag * j) / NPOINTS}; + if (in_mandelbrot(c)) + num_inside++; + } + } + auto finish = clock::now(); + + // Calculate area and error and output the results + auto area = 2.0 * scale_real * scale_imag * double(num_inside) / double(NPOINTS * NPOINTS); + auto error = area / double(NPOINTS); + + std::printf("Area of Mandlebrot set = %12.8f +/- %12.8f\n", area, error); + auto dt = std::chrono::duration(finish-start); + std::printf("Time = %12.8f seconds\n", dt.count()); +} diff --git a/exercises/threads/instructions.md b/exercises/threads/instructions.md new file mode 100644 index 0000000..7e909da --- /dev/null +++ b/exercises/threads/instructions.md @@ -0,0 +1,32 @@ +# C++ threads +## Mark Bull +## m.bull@epcc.ed.ac.uk + +Source for this can be obtained from Github. Get a new copy with: + +``` +git clone https://github.com/EPCCed/APT-CPP +``` + +or update your existing one with + +``` +git pull +``` + +then you can + +``` +cd APT-CPP/exercises/threads +``` + + + +`area.cpp` contains a C++ version of the Mandelbrot example which you used in Threaded Programming. + +Parallelise the outer loop of the main computation using C++ +threads. Try using either a function pointer or a lambda expression. + +Try different mechanisms for synchronising the update to the shared +counter: a mutex, and lock guard or an atomic int. + diff --git a/exercises/threads/instructions.pdf b/exercises/threads/instructions.pdf new file mode 100644 index 0000000..89d5995 Binary files /dev/null and b/exercises/threads/instructions.pdf differ diff --git a/lectures/README.md b/lectures/README.md new file mode 100644 index 0000000..c7635af --- /dev/null +++ b/lectures/README.md @@ -0,0 +1,11 @@ +# Lectures + +* [A brief introduction to C++](cpp-intro) +* [Class types](classes) +* [Loops, containers, and iterators](loops-containers) +* [Managing resources](resources) +* [Templates for generic programming](templates) +* [Algorithms and lambdas](algorithms-lambdas) +* [Linear algebra with Eigen](eigen) +* [Threads with C++](threads-1) and [Further topics with threads](threads-2) +* [C++ frameworks for portable performance](frameworks-kokkos) diff --git a/lectures/algorithms-lambdas/README.md b/lectures/algorithms-lambdas/README.md new file mode 100644 index 0000000..1e884f8 --- /dev/null +++ b/lectures/algorithms-lambdas/README.md @@ -0,0 +1,416 @@ +template: titleslide + +# Algorithms and lambdas +## Rupert Nash +## r.nash@epcc.ed.ac.uk + +--- + +template: titleslide +# Recap +--- + +# Iterators +```C++ +std::vector data = GetData(n); + +// C style iteration - fully explicit +for (auto i=0; i != n; ++i) { + data[i] *= 2; +} +// Old C++ style - hides some details +for (auto iter = data.begin(); iter != data.end(); ++iter) { + *iter *= 2; +} +// New range-based for +for (auto& item : data) { + item *= 2; +} +``` + +--- +template: titleslide +# Standard algorithms library +--- + +# Standard algorithms library + +The library includes many (around 100) function templates that +implement algorithms, like "count the elements that match a criteria", +or "divide the elements based on a condition". + +These all use *iterators* to specify the data they will work on, e.g., +`count` might use a vector's `begin()` and `end()` iterators. + +```C++ +#include +std::vector data = Read(); +int nzeros = std::count(data.begin(), data.end(), + 0); + +bool is_prime(int x); +int nprimes = std::count_if(data.begin(), data.end(), + is_prime); +``` +--- +# Standard algorithms library + +Possible implementation: +```C++ +template +intptr_t count_if(InputIt first, InputIt last, + UnaryPredicate p) { + intptr_t ans = 0; + for (; first != last; ++first) { + if (p(*first)) { + ans++; + } + } + return ans; +} +``` + +(Unary `\(\implies\)` one argument, a predicate is a Boolean-valued +function.) + +--- + +# Key algorithms + +|Algorithm | Description | +|--|---| +| `for_each` | Apply function to each element in the range. | +| `count`/`count_if`| Return number of elements matching. +| `find`/`find_if` | Return iterator to first element matching or end if no match. | +| `copy`/`copy_if` | Copy input range (if predicate true) to destination | +| `transform` | Apply the function to input elements storing in destination (has overload work on two inputs at once) | +| `swap` | Swap two values - used widely! You may wish to provide a way to make your types swappable (see cpprefence.com) | +| `sort` | Sort elements in ascending order using either `operator<` or the binary predicate provided | +| `lower_bound`/ `upper_bound` | Given a *sorted* range, do a binary search for value. | + +--- + +# `for_each` + +One of the simplest algorithms: apply a function to every element in a +range. +```C++ +template< class InputIt, class UnaryFunction > +UnaryFunction for_each(InputIt first, + InputIt last, + UnaryFunction f); +``` + +Why bother? + +- Clearly states your intent + +- Cannot get an off-by-one errors / skip elements + +- Works well with other range-based algorithms + +- Concise if your operation is already defined in a function + +However often a range-based for loop is better! + +--- +# `transform` + +A very powerful function with two variants: one takes a single range, +applies a function to each element, and stores the result in an output +iterator. +```C++ +template +OutputIt transform(InputIt first1, InputIt last1, + OutputIt d_first, + UnaryOperation unary_op ); +``` + +This is basically the equivalent of `map` from functional programming or MapReduce. + +```C++ +std::vector data = GetData(); +std::transform(data.begin(), data.end(), + data.begin(), double_in_place); +``` + +You can use your input as the output. + +--- + +# Motivation + +Implementations have been written *and tested* by your compiler +authors. + +The library may be able to do platform-specific optimizations that you +probably don't want to maintain. + +They form a language to communicate with other programmers what your +code is really doing. + +It's nice code that you don't have to write. + +```C++ +for (auto it = images.begin(); + it != images.end(); ++it) { + if (ContainsCat(*it)) { + catpics.push_back(*it); + } +} +``` +vs +```C++ +std::copy_if(images.begin(), images.end(), + ContainsCat, std::back_inserter(catpics)); +``` + +--- +template:titleslide + +# Lambda functions +--- + +# Algorithms need functions + +Very many of the algorithms just discussed need you to provide a +function-like object as an argument for them to use. + +If you have to declare a new function for a one-off use in an algorithm +call that is inconvenient and moves the code away from its use site. + +Worse would be to have to create a custom function object class each +time! + +--- +# A verbose example + +```C++ +struct SquareAndAddConstF { + float c; + SquareAndAddConstF(float c_) : c(c_) {} + + float operator()(float x) { + return x*x + c; + } +}; + +std::vector SquareAndAddConst(const std::vector& x, float c) { + std::vector ans; + ans.resize(x.size()); + + std::transform(x.begin(), x.end(), ans.begin(), + SquareAndAddConst(c)); + return ans; +} +``` +--- + +# Lambdas to the rescue + +- A lambda function a.k.a. a closure is a function object that does + not have a name like the functions you have seen so far. + +- You can define one inside a function body + +- You can bind them to a variable, call them and pass them to other + functions. + +- They can capture local variables (either by value or reference). + +- They have a unique, unknown type so you may have to use `auto` or + pass them straight to a template argument. + +--- + +# A less verbose example +```C++ +std::vector SquareAndAddConst(const std::vector& x, float c) { + std::vector ans; + ans.resize(x.size()); + auto func = [c] (double z) { return z*z + c; }; + std::transform(x.begin(), x.end(), ans.begin(), + func); + return ans; +} +``` + +--- + +# A less less verbose example +```C++ +std::vector SquareAndAddConst(const std::vector& x, float c) { + std::vector ans; + ans.resize(x.size()); + std::transform(x.begin(), x.end(), ans.begin(), + [c] (double z) { return z*z + c; }); + return ans; +} +``` + +--- +# Anatomy of a lambda +``` +[captures](arg-list) -> ret-type {function-body} +``` +| | | +|---|----| +| `[ ... ]` | new syntax that indicates this is a lambda expression | +| `arg-list` | exactly as normal | +| `function-body` | zero or more statements as normal | +| `-> ret-type` | new C++11 syntax to specify the return type of a function - can be skipped if return type is void or function body is only a single `return` statement. | +| `captures` | zero or more comma separated captures | + + +You can capture a value by copy (just put its name: `local`) or by reference +(put an ampersand before its name: `&local`). + +--- +# Anatomy of a lambda + +``` +[captures](arg-list) -> ret-type {function-body} +``` + +This creates a function object of a unique unnamed type (you +must use `auto` to store it in a local variable). + +You can call this like any object that overloads `operator()`: +```C++ +std::cout << "3^2 +c = " << func(3) << std::endl; +``` + +Note that because it does not have a name, it cannot take part in +overload resolution. + +--- + +# Quick Quiz + +What does the following do? +```C++ + [](){}(); +``` +-- + +Nothing + +--- + +# Uses + +- STL algorithms library (or similar) - pass small one-off pieces of + code in freely + +```C++ +std::sort(molecules.begin(), molecules.end(), + [](const Mol& a, const Mol& b) { + return a.charge < b.charge; + }); +``` + +- To do complex initialisation on something, especially if it should + be `const`. + +```C++ +const auto rands = [&size] () -> std::vector { + std::vector ans(size); + for (auto& el: ans) { + el = GetRandomNumber(); + } + return ans; +}(); // Note parens to call! +``` + +--- +# Rules of thumb + +Be careful with what you capture! + +![:thumb](If your lambda is used locally - including passed to +algorithms - capture by reference. This ensures there are no copies +and ensures any changes made propagate.) + +![:thumb](If you use the lambda elsewhere, especially if you return +it, capture by value. Recall references to locals are invalid after +the function returns! References to other objects may still be valid +but hard to keep track of.) + +![:thumb](Keep lambdas short - more than 10 lines you should think +about moving it to a function/functor instead.) + + +--- +template: titleslide +# Performance + +??? + +Many people are a bit concerned that using iterators/lambdas etc +incurs some overhead - after all you're calling a bunch of +functions. So let's benchmark them + +--- + +# Many ways to iterate +```C++ +// C style iteration - fully explicit +for (auto i=0; i != n; ++i) { + data[i] *= 2; + } + +// Old C++ style - hides some details +for (auto iter = data.begin(); iter != data.end(); ++iter) { + *iter *= 2; +} + +// New range-based for +for (auto& item : data) { + item *= 2; +} + +// Algorithms library +std::for_each(data.begin(), data.end(), + double_in_place); +``` + +--- +# Is there any overhead? + +Going to quickly compare four implementations to scale by 0.5: + +- C-style array indexing + +- Standard vector with iterator + +- Standard vector with range based for-loop + +- Standard vector with std::for_each + +```C++ +int main(int argc, char** argv) { + int size = std::atoi(argv[1]); + std::vector data(size); + for (auto& el: data) + el = rand(1000); + Timer t; + scale(data.data(), data.size(), 0.5); + std::cout << size << ", " + << t.GetSeconds() << std::endl; +} +``` + +--- +# Results +.center[![-O0](looptests/opt0.svg)] + +--- +# Results +.center[![-O2](looptests/opt2.svg)] + +--- +# Exercise + +See [exercises/algorithm](../../exercises/algorithm) + diff --git a/lectures/algorithms-lambdas/index.html b/lectures/algorithms-lambdas/index.html new file mode 100644 index 0000000..b2fe5d7 --- /dev/null +++ b/lectures/algorithms-lambdas/index.html @@ -0,0 +1,31 @@ + + + +C++ for numerical computing part 2 + + + + + + + + + + + + + + + diff --git a/lectures/algorithms-lambdas/looptests/opt0.svg b/lectures/algorithms-lambdas/looptests/opt0.svg new file mode 100644 index 0000000..19477d2 --- /dev/null +++ b/lectures/algorithms-lambdas/looptests/opt0.svg @@ -0,0 +1,1489 @@ + + + + + + + + 2023-02-24T11:49:36.314145 + image/svg+xml + + + Matplotlib v3.4.3, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lectures/algorithms-lambdas/looptests/opt2.svg b/lectures/algorithms-lambdas/looptests/opt2.svg new file mode 100644 index 0000000..274a421 --- /dev/null +++ b/lectures/algorithms-lambdas/looptests/opt2.svg @@ -0,0 +1,1401 @@ + + + + + + + + 2023-02-24T11:49:37.044643 + image/svg+xml + + + Matplotlib v3.4.3, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lectures/classes/README.md b/lectures/classes/README.md new file mode 100644 index 0000000..5d2ae97 --- /dev/null +++ b/lectures/classes/README.md @@ -0,0 +1,818 @@ +template: titleslide +# Classes +## Rupert Nash, EPCC +## r.nash@epcc.ed.ac.uk + +--- +# Compiler explorer (Godbolt) + +A very useful tool is the Compiler Explorer - https://godbolt.org + +Type some code on the left and it will compiled with your choice of +very many compilers. + +Will show compiler errors/warnings/reports and can execute your +program. + +Can get shareable links to your code. + +??? + +The name comes from the creator, Matt Godbolt + +--- +# User defined types + +Called "class types" formally - can be defined with either the `class` +or `struct` keyword. + +```C++ +struct Complex { + double re; + double im; +}; +``` + +Let's show this on Compiler Explorer: https://godbolt.org/z/j8Kf31T89 + +This style of type is often called a "plain old data" (POD) type or a +"trivial" type. + +This simply aggregates together several *members* which have a name +and type. You can then pass them around together and access the data. + +??? + +Talked already about the built in types and one of the library +provided types (`std::string`) + + +--- +# Class types + +Creating trivial types - give the class name then list the values to be assigned to the +members, _in order_, inside braces: + +```C++ +Complex mk_imaginary_unit() { + return Complex{0, 1}; +} +``` +This is called aggregate initialisation. + +Alternatively: +```C++ +Complex mk_imaginary_unit() { + Complex sqrt_m1; // Values are uninitialised + sqrt_m1.re = 0; + sqrt_m1.im = 1; + return sqrt_m1; +} +``` + +--- +# Class types + +Using trivial types: + +```C++ +void test() { + Complex z = mk_imaginary_unit(); + assert(z.re == 0); + assert(z.im == 1); +} +``` + +Any piece of code can access the member variables inside the object. + +??? + +This is probably something you're familiar with from other languages + +--- +template: titleslide +# Creating instances + +--- +# Default member initialisers + +In the class definition you can (should?) provide default values for +member variables: + +```C++ +struct Complex { + double re = 0.0; + double im = 0.0; +}; +``` + +Now when you create an object it will be initialised to a know state +for you: + +```C++ +void test() { + Complex z; + assert(z.re == 0); + assert(z.im == 0); +} +``` + +??? + +Why "should"? + +The compiler will generate code for you to set the defaults whenever +you create these objects - can't forget to do this. + +Why not should? + +Perhaps your types act as much like the builtin types as +possible. Initialising members incurs a RUNTIME COST which perhaps you +don't wish to pay. + +--- +# Default member initialisers + +A note for C++11: you lose the ability to set the member variables by +just providing the values: + +```C++ +struct Complex { + double re = 0.0; + double im = 0.0; +}; + +Complex mk_imaginary_unit() { + return Complex{0, 1}; +} +``` + +GCC says: +``` +error: no matching constructor for initialization of 'Complex' + return Complex{0,1}; +``` + +--- +# Constructors +Often you want to control the creation of instances of your classes. + +You do this with _constructors_ - these are special member "functions" +with the same name as the type. + +```C++ +struct Complex { + Complex() = default; + Complex(double re); + Complex(double re, double im); + double re = 0.0; + double im = 0.0; +}; +``` + +Note we declare three: +- one that initialises with a purely real value +- one that initialises with a real and imaginary value +- a *default constructor* which needs no arguments (that we tell the + compiler to generate for us as before with `= default` ) + +??? + +Control in more detail than just starting from a default value or +having to provide *all* the of member values. + +Constructors are not strictly functions in C++ but very nearly (next slide) + +Why do you have to "explictly default the default constructor"? + +Because the language rules say if the user provides any constructors, +the compiler must not create one unless asked to... + +--- +# Constructors + +- Constructors are not directy callable +- Constructors do not return a value +- Constructors can do initialisation of member variables before the body begins execution + + +Let's define the ones we declared just now: + +```C++ +Complex::Complex(double real) : re{real} { +} + +Complex::Complex(double real, double imag) : re{real}, im{imag} { +} +``` + +??? + +Point out the member initialiser syntax after the colon + +Assert that this is where you want to do as much initialisation as you +possibly can. + + +--- +# Creating complexes + +We can now create complex numbers different ways +```C++ +auto zero = Complex{}; // Uses default member initialisers +auto pi = Complex{3.14159}; // pi.im == 0.0 +auto sqrt_minus_one = Complex{0, 1}; +Complex one = 1.0; +``` + +--- +template: titleslide +# Member functions + +--- +# Object-oriented programming + +Object oriented programming is one of the major paradigms supported by +C++ + +> OOP is based on the concept of "objects", which may contain data and +> code. A feature of objects is that an object's procedures can access +> and often modify the data of the object with which they are +> associated. + +In C++ these attached bits of code are known as member functions - +other languages might call them methods + +For example: + +```C++ +int main() { + std::string name; + std::cin >> name; + std::cout << "Hello, " << name << ". " + << "Your name has " << name.size() << " characters." << std::endl; +} +``` +??? + +Quote from wikipedia + +Note that an object in C++ may not be an object in the OOP sense! + +--- +# Member functions + +Typically these are declared in the class definition... + +```C++ +// complex.hpp +struct Complex { + // Constructors as before + double magnitude() const; + + double re = 0.0; + double im = 0.0; +}; +``` + +??? + +If anyone asks, discussion of const is coming up! + +--- +# Member functions + +... and defined out of line + +```C++ +// complex.cpp +double Complex::magnitude() const { + return std::sqrt(re*re + im*im); +} +``` + +Within a member function we can access the members (data or function) +by just giving their names. + +Outside the class, we need to have an instance to use: + +```C++ +Complex z{3, 4}; +assert(z.magnitude() == 5.0); +``` + +??? + +The compiler inserts an implicit argument referring to the current +instance for us + +--- +# More on operator overloading + +Complex numbers have the usual arithmetic operations: `\(+-\times\div\)` + +We can provide operator overloads, like: + +```C++ +struct Complex { + // Other members... + + Complex& operator+=(Complex const & increment) { + re += increment.re; + im += increment.im; + return *this; + } +}; +Complex operator+(Complex const& a, Complex const& b) { + return Complex{a.re+b.re, a.im+b.im}; +} +``` + +??? + +Recall that these are just functions (with funny names) + +We have a member function `operator+=` and a non-member (aka free +function) + +The compiler is already implicitly turning `a+b` into plus(a, b) +internally. + +If anyone asks, references and `const` coming up + +--- +# More on operator overloading + +Complex numbers have the usual arithmetic operations +(`\(+-\times\div\)`) and comparisons. + +We can now use the natural syntax to add these values + +```C++ +Complex i{0, 1}; + +Complex z; +z += i; + +assert(z.re == 0 && z.im == 1); + +auto c = z + i; +assert(c == 2*i); +``` + +??? + +We could also overload multiplication and equality comparison as shown +in the last line + +Go look up the complete list on CPP Reference! + +--- +# Classes and structs + +You define a class using either the `struct` or `class` keywords - +technically the difference is only the default _accessibility_ of +members. + +Members can be: + +- `public`: usable from any piece of code - i.e. the public interface + of the class - default for `struct` + +- `private`: only usable from with a the context of the class - + i.e. an implementation detail of the class that should be of no + interest to code that uses it - default for `class` + +Restricting access to implementation details is part of encapsulation: +another common aspect of OOP + + +??? + +Context of a class means from inside a member function or from +elsewhere in the class scope + + +--- +# Access Control + +```C++ +class Greeter { + std::string greetee; + void say_hello() const { + std::cout << "Hello, " << greetee << std::endl; + } +}; + +void test(Greeter const& g) { + g.say_hello(); +} +``` +??? + +Compiler will say something like +`error: 'say_hello' is a private member of 'Greeter'` + +--- +# Access Control + +```C++ +class Greeter { + std::string greetee; +public: + void say_hello() const { + std::cout << "Hello, " << greetee << std::endl; + } +}; + +void test(Greeter const & g) { + g.say_hello(); +} +``` +??? + +To fix this need to add the `public` access specifier + +Why is encapsulation worth bothering with? Enforces modularity and can +(sometimes) swap out parts of the implementation for e.g. performance +without have to re-write the client code + +A class can declare another function (or class) its `friend` which +allows only that bit of code to touch its private members. + +This is a controlled, partial relaxation of encapsulation that often +makes the whole system more isolated. + + +--- +template: titleslide +# Constants + +--- +# Const-ness + +Variables can be qualified with the `const` keyword + +- objects marked with `const` cannot be modified + +- compiler will give errors if you try + +```C++ +int main() { + int const i = 42; + std::cout << i << std::endl; + // prints 42 + + i += 10; + // error: cannot assign to variable 'i' with const-qualified type 'const int' +} +``` + +You should be adding `const` to your local variables whenever possible. + +??? + +We can have a few degrees of constantness + +1. known at compile time: the value of pi is + always 3.14159... Compiler might be able to help us here + +2. local variable: the variable gets a single value for that one + execution of that block of code. Compiler probably won't speed it + up, but it helps us humans: + * You are giving something a *name* so you can refer to it again. It + means you won't accidentally change something + * give it a proper name: `local_mean_density` not `d` + * Maybe do this even if you only use it once (compiler will + probably optimise it away) + +--- +# Const-ness + +Since function parameters are just local variables, they can be +`const` too. + +```C++ +Complex operator-(Complex const a, Complex const b) { + return Complex{a.re - b.re, a.im - b.im}; +} +``` + +Since this function has no need to modify it's arguments, they should +be marked as `const` + +??? + +Deliberately omitting the `&` for references - that's next + + +--- +# Const-ness + +Member functions have an implicit argument of the instance they +"belong" to. + +If they do not need to change an instance, then they should be marked +`const` also: + +```C++ +struct Complex { + double magnitude() const; +}; +``` + +??? + +`this` pointer, if anyone asks, but we don't like pointers + + +Member variables can be marked `const`, but don't do it + +--- +# East-const vs West-const + +As far as the compiler and language specification are concerned, both +of these are identical: + +.columns[ +.col[ +```C++ +const int i = 42; +``` + +Rule: `const` applies to what is on its left, unless there is nothing on its left, then it applies to what is on its right + +] +.col[ +```C++ +int const i = 42; +``` +Rule: Always put `const` on the right of what it modifes +] +] + +It doesn't really matter as long as you're **consistent** within a project. + +??? + +East const is a little more logical hence using it here! + +Older codebases (and C++ programmers) will be West const (e.g. RWN) + +--- +template: titleslide +# References + +--- +# Copying + +By default, if assign a variable a new value, C++ will copy it +```C++ +double copy = original; +``` + +Same if you pass a value of some type to a function, it will be +copied to a new local variable for the invocation of a function: + +```C++ +SimState AdvanceOneTimeStep(SimState old); + +// later... +next_state = AdvanceOneTimeStep(current_state); +``` + +??? +Exactly what copying means for a class type is up to the programmer - +forms part of the interface + +If you're doing some complex simulation like computational fluid +dynamics/molecular dynamics, the `current_state` will be large and be +expensive (in time and memory) to copy. + +--- +# References + +C++ has the concept of a **reference**. + +- A reference is a name that aliases an already existing object + +- A reference must be initialised (function arguments are initialised + by calling) + +- A reference cannot be rebound to a new object + +```C++ +void ScaleVector(double scale, std::vector& x) { + // Multiply every element of x by scale +} + +void test() { + std::vector data = ReadLargeFile(); + ScaleVector(-1.0, data); +} +``` + +??? + +We qualify the type with the ampersand `&` after the type and before +the variable name. + +When I say must be intialised, mean that you can't just declare a +reference variable without initialising it + +At the call site the reference is bound to parameter and the function +can refer to `data` without copying it + +This is close to the way that Fortran passes arguemnts to subroutines + +--- +# References and Const + +You can qualify a reference as const: + +```C++ +SimState AdvanceOneTimeStep(SimState const& old) { + // do some science... + return new_state; +} +``` + +??? + +This means that the program cannot modify the contents of `old` +through that name. + +The compiler will give an error if you try to change it (or a call a +function that could change it). + +--- +# Functions can return a reference + +Many classes have data members that they want to let their client have +some access to without (necessarily) copying + +Or access to a single value from the many they might contain + +```C++ +class AtomList { + std::vector position; + std::vector velocity; + std::vector charge; +public: + std::vector const& GetCharge() const { + return charge; + } +}; + +void analyse_md_data() { + AtomList const atoms = ReadFromFile(); + std::vector const& charges = atoms.GetCharge(); + std::cout << "Average charge = " + << std::accumulate(charges.begin(), charges.end()) / charges.size() + << std::endl; +} +``` + +??? + +So when we call the `GetCharge` the local var `charges` refers to the +same data contained inside `atoms` without doing any copying + +Returning references is very common - especially for containers that +we'll talk about later. + +--- +# References and const with `auto` + +When you declare a variable with `auto` you can qualify it it as a +reference and/or `const`: + +- Use `auto x` when you want a modifiable copy + +- Use `auto&& x` when you want a reference and the same `const`-ness + as the initialiser + +- Use `auto const &x` when you want a reference to original item and + will not modify it + +![:thumb](Use the last whenever possible) + +??? + +Note double ampersand in 2nd point - it's called a forwarding reference and we will +come back to it + +Refer back to the last slide codes and ask + +- what happens if instead of `std::vector const&` we put + `auto charges`? (charges deduces to `std::vector` and we copy) + +- `auto const & charges` (charges deduces just that same as written - + we have a reference) + +- `auto&& charges`? (exactly as before but *if* we had returned a + mutable reference then it would be a mutable ref while `auto const&` + would have stayed constant) + + +--- +template: titleslide +# C++ compilation + +--- +# Declarations vs Definitions + +C++ distinguishes between *declaration* and *definition*. + +A **declaration** tells the compiler that a name exists, what kind of +entity it refers to and (if it is a function or variable) its +type. For most uses this is all the compiler needs. Declarations can +be repeated, as long as they match *exactly*. + +A **definition** tells the compiler everything it needs to create +something in its entirety. A definition is also a declaration. The +one-definition rule says that definitions must not be repeated (with +an exception). + +??? + +The exceptions being templates and `inline` functions if anyone asks + +--- +# Where to put these + +- Conventionally, one puts declarations of functions, definitions of + classes, and global constants in **header** files. + + - Common suffixes are: `.hpp`, `.h`, `.H` + +- Definitions of most functions should be in **implementation** files + + - Common suffixes are `.cpp`, `.cxx`, `.cc`, `.C` + +- Headers can be be `#include` into other files that need to use the + types and function declared there. + +??? + +Suffixes mostly meaningless to the compiler but don't surprise people! + +Prefer earlier in the list + +--- +# Where to put these + +E.g. Complex.hpp: + +```C++ +#ifndef COMPLEX_HPP +#define COMPLEX_HPP + +struct Complex { + Complex() = default; + // etc... +private: + double re; + double im; +}; + +Complex operator+(Complex const& a, Complex const& b); +// Etc... + +#endif +``` +??? + +Draw attention to the include guard idiom + +--- +# Exercise + +In your clone of this repository, find the `complex` exercise and list +the files + +``` +$ cd APT-CPP/exercises/complex +$ ls +Makefile complex.cpp complex.hpp test.cpp +``` + +The files `complex.hpp` and `complex.cpp` contain a partially working +complex number class and `test.cpp` holds some basic unit tests. + +You can compile and run with: +``` +$ make && ./test +c++ --std=c++17 -I../include -c -o complex.o complex.cpp +c++ --std=c++17 -I../include -c -o test.o test.cpp +c++ complex.o test.o -o test +=============================================================================== +All tests passed (34 assertions in 6 test cases) +``` + +But to get to this point you need to complete the code and fix a few +bugs! + +??? + +Ask who knows about make maybe? diff --git a/lectures/classes/index.html b/lectures/classes/index.html new file mode 100644 index 0000000..32819ae --- /dev/null +++ b/lectures/classes/index.html @@ -0,0 +1,31 @@ + + + +Classes in C++ + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/lectures/cpp-intro/README.md b/lectures/cpp-intro/README.md new file mode 100644 index 0000000..c616096 --- /dev/null +++ b/lectures/cpp-intro/README.md @@ -0,0 +1,580 @@ +template: titleslide + +# A brief introduction to C++ +## Rupert Nash +## r.nash@epcc.ed.ac.uk + +--- + +template: titleslide +# Introduction + +--- +# Assumptions + +- You are a decent programmer in another language + +- You know how to use the shell + +- You have access to a terminal in front of you with a C++ compiler + +--- +# What this is not! + +Writing efficient software, more than anything, requires you to choose +an appropriate algorithmic approach for your problem. + +Here we want to take a lower-level approach and talk about how to +implement patterns efficiently using C++. + +??? + +For lots on how to do this well, consider other courses such as the Parallel Design Patterns +course from the MSc + +--- +# What is "scientific computing"? + +Both HPC and data science, when you actually come to running a +program, are about getting a large amount of data from memory to a +core, doing something useful to it, and storing it again. + +This is why FORTRAN is still relevant! + +But it does force you to confront this all time. + +C++ is all about building abstractions and composing them. + +![:thumb](I will talk about a few and give some suggestions of +default rules) + +--- +# These few lectures + +We could spend a whole semester going in depth on C++, so we've picked +a handful of features to cover that you really need to write modern +C++ for technical computing. + +This is not trying to teach the *whole language* in the many different +styles that people have developed over the decades. + +Please ask questions any time! + +--- +# The misunderstood monster + +.center[![:scale_img 30%](frank_mon.jpg)] + +By Universal Studios - Dr. Macro, Public Domain, +https://commons.wikimedia.org/w/index.php?curid=3558176 + +--- +# The misunderstood monster + +- Large: the C++20 standard is just over 1800 pages + +- Composed of many parts: C, classes, generics, functional + programming, exceptions, the vast library, ... + +- Inspires dread in those do not understand it + +- Dangerous: +> "C makes it easy to shoot yourself in the foot; +> C++ makes it harder, but when you do it blows your whole leg +> off." - Bjarne Stroustrup + +- "Expert friendly" + +--- +# Octopus vs Swiss Army knife + +> "C++ is an octopus made by nailing extra legs onto a dog" - Steve Taylor + +.center[![:scale_img 40%](octodog.jpg) ![:scale_img 40%](sak.jpg)] + +But you can cut off some extra legs to get the right dog for your +program! + +??? + +Why is it called C++20? + +Because that's how many legs they had to nail on to "fix" the octopus. + +--- +# The philosophy of C++ (community) + +- General purpose + +- Flexible by allowing developers to build abstractions (and provides + a large number through the library) + +- Performance and efficiency are always targeted "you only pay for + what you use" + +- Use the powerful type system to express intent + +- Communicate with the reader, not the compiler + +--- +# C++ is alive! + +C++ is a work in progress. + +Every three years there is a new update to the International Standard + +Latest one, C++20, still not fully implemented by an compiler. Major +new features are ranges, coroutines, concepts, and modules + +Next one, in 2023, well underway. Major features likely to include +networking, string formating, executors, and consolidation of new +C++20 features + +--- +# References +- Bjarne Stroustrup, "Programming: Principles and Practice Using C++" + (2nd Ed.). Assumes very little but it's long + +- Bjarne Stroustrup, "A Tour of C++". Assumes you're an experience + programmer and is quite brief - targets C++17 + +- Best online *reference* is + (comes in other human languages too!) + +- Scott Meyers, "Effective Modern C++", 2014. This is *the* book to + get once you know your way around C++, but you want to improve. + Teaches lots of techniques and rules of thumb for writing correct, + idiomatic, maintainable code. + +- [stackoverflow.com](stackoverflow.com) has a lot of good questions + about C++ (look for ones with at least 100 up-votes). + +--- +template: titleslide +# But first\... + +--- +name: hello +# Hello! + +```C++ +#include + +int main() { + std::cout << "Hello, world!" << std::endl; +} +``` + +--- +template: hello + +``` +$ g++ --std=c++17 hello.cpp -o hello +$ ./hello +Hello, world! +``` + +--- +template: hello + +- The preprocessor runs first + +- The `#include` directive copies the contents of another file into + the current compilation unit. + +- The angle brackets `<...>` tell it to look only in system + directories. + +- This includes the `iostream` standard library header + +--- +template: hello + +- Every program must have exactly one main function. + +- The compiler and OS arrange for this to be called when you run it. + +- (Not shown, but you can also get the command line arguments) + +--- +template: hello + +- `std` is the standard library namespace. + +- A namespace allows scoping of names (much like a filesystem has + directories). + +- The scope resolution operator `::` lets us access something from + inside a namespace. + +- `cout` represents console output (i.e. standard output / stdout) + +--- +template: hello + +- The standard library uses the bitwise left shift operator (`<<`) to + mean stream insertion + +- I.e. output the right hand side to the left. + +- Every statement in C++ must be terminated with a semicolon (`;`) + +- The language treats all whitespace (space, tab, line break) the same + +--- + +template: titleslide +# Your turn + +--- +# Machine choice + +You can use your laptop, Cirrus, or ARCHER2 + +__Your machine__ : you need a C++ compiler that supports at least +C++17. If you use Windows and MSVC we may not be able to help +much... Sorry! + +__Cirrus__: You have log in details. + +Once connected you need to load the up-to-date compilers: +``` +module load gcc/8.2.0 +``` + +__ARCHER2__: You have log in details. + +Once connected you need to load the up-to-date compilers: +``` +module load gcc/10.2.0 +``` + +--- +# Getting the source code + +All these slides and the exerices are available on GitHub: +https://github.com/EPCCed/APT-CPP + +You also can view the slides and other materials in a browser by going +to https://EPCCed.github.io/APT-CPP/ + +In the terminal, you need to use git get a copy of the exercises: +``` +git clone https://github.com/EPCCed/APT-CPP +``` + +Then you can change to the directory with this simple program +``` +cd APT-CPP/lectures/cpp-intro/hello +``` + +--- +# Say hello + +View the program source + +``` +vim hello.cpp +emacs hello.cpp +``` + +Compile the program: + +``` +g++ --std=c++17 hello.cpp -o hello +``` + +No output means success! + +Run it: +``` +./hello +``` + +--- +template: titleslide +# A few things + +--- +# C++: + +- It is a typed language - all variables must be declared + - But you can often tell the compiler to figure out the type + +- Counts from zero (like C, Python) not from one (like Fortran) + +--- +template: titleslide +# Variables + +--- +# Variables + +A variable is + +- an object + +or + +- a reference + +that is declared to have a type and a name. + +??? + +We'll talk about references later + +--- + +# Objects + +An object is a region of storage that has: + +- type +- size +- value +- lifetime + +??? + +types are next, explain a bit about the others + +--- + +template: titleslide +# Types + +--- +# What is a type? + +> "Objects and expressions have a property called type, which +> both restricts the operations that are permitted for those entities +> and provides semantic meaning to the otherwise generic sequences of +> bits." -- + +--- +# Fundamental types + +| Type | Description | +|-------------|-------------| +| `void` | Nothing - used to indicate a function returns no value.| +| `bool` | `true` or `false` | +| `int` | Standard *signed* integer for your machine. *At least* 16bits. *Usually* 32 bits.| +| `double` | Double-precision floating point. *Usually* an IEEE 754 64 bit number.| +| `std::byte` | Raw untyped memory | + +There are also `unsigned` versions of the integer types + +The header `` provides fixed-width integer types available on +your implementation: e.g. `std::int32_t` and `std::uint64_t`. + +??? + +There are others: +- single precision floats +- short/long/long long ints + +Go look them up on CPP Reference as you need to + +--- + +# Strings + +The standard library has a class* called `string` that holds a string +of text characters. + +You have to `#include ` to use it which includes the "header +file" that contains all the information the compiler needs to let you +use it. + +``` +#include +#include + +int main() { + std::string message = "Hello, world"; + std::cout << message << std::endl; +} +``` + +??? +Character encoding in the standard library is a bit of a mess. + +Partially fixed in C++20 + +Find a library if you need to do serious text handling because Unicode +is super complicated + +--- +template: titleslide +# Functions + +--- +# Functions + +A function encapsulates a piece of code between braces (curly +brackets, `{}`) and gives it a name so you can use it later. + +```C++ +void say_hello() { + std::cout << "Hello, world!" << std::endl; +} + +int main(int argc, char* argv[]) { + say_hello(); +} +``` +??? + +You declare function by first giving the return type (`void`) + +Then the name (`say_hello`) + +Then the list of zero or more parameters + +--- +# Functions + +You must declare the return type and parameter types. + +Parameters are local variables that are initialised by the caller. + +Return a value with the `return` statement - the type of the +expression must be (convertible to) the declared return type. + +```C++ +int sum(int a, int b) { + return a + b; +} +``` + +Can alse use trailing return type (aka "east end functions"): +```C++ +auto sum(int a, int b) -> int { + return a + b; +} +``` + +??? + +First sight of the `auto` keyword - this basically says to the +compiler: "figure it out from what follows" + +In this case, by us telling it later + +--- +# Functions + +To use a function - to "call" it - you give its name and then provide +the arguments in parentheses + +``` +int main () { + int x = 42; + std::cout << "Total = " << sum(x, 100) << std::endl; +} +``` + +The parameters to the function must match the (a) declaration. + +--- +# Function overloading + +You can have multiple functions with the **same +name** but **different arguments**. +```C++ +int sum(int a, int b) { + return a + b; +} +double sum(double a, double b) { + return a + b; +} +``` + +When you call `sum`, the compiler knows the types of the arguments and +will try to find the best match from all the candidates with the name. + +The compiler will also try to use any built-in or user-defined +conversion rules. + +--- +# What happens here? + +```C++ +int i1 = 1; +int i2 = 2; +double d1 = 1.0; +double d2 = 2.0; +unsigned u42 = 42; +std::string name = "Alice"; +std::string file = "data.csv"; + +std::cout << sum(i1, i2) << std::endl; +std::cout << sum(3, 72) << std::endl; +std::cout << sum(i1, u42) << std::endl; +std::cout << sum(d2, d1) << std::endl; +std::cout << sum(d2, 1e6) << std::endl; +std::cout << sum(d2, i1) << std::endl; +std::cout << sum(name, file) << std::endl; +``` + +--- +# Operators are functions + +C++ operators, for the non-fundamental types, are just functions with odd +names, e.g.: +```C++ +std::string operator+(const std::string& a, const std::string& b); +``` + +You can then use the natural syntax when manipulating these in other +code: + +```C++ +std::string user_name = "alice"; +auto data_file = user_name + ".csv"; +``` + +We'll come back to this! + +??? + +In general we'd recommend using auto quite a lot "Almost always auto" + +Why? + +Can't have an uninitialized variable + +Add types - on RHS as constructors - when you need to ensure the type +of something (is known to the reader). + + +--- +# Let's write some code + +```C++ +#include +#include + +void say_hello() { + std::cout << "Hello, world!" << std::endl; +} + +int main(int argc, char* argv[]) { + std::cout << "What is your name?" << std::endl; + auto name = std::string{}; + // Read from the terminal + std::cin >> name; + + // Have the program greet the user by name + say_hello(name); +} +``` + +??? + +What I'd like you to do is change `say_hello` to accept the name it +read from the terminal, create a new message saying "Hello, $NAME!" +and print it to standard output. + +I've shown here how to read a string from standard input. diff --git a/lectures/cpp-intro/frank_mon.jpg b/lectures/cpp-intro/frank_mon.jpg new file mode 100644 index 0000000..d6ef376 Binary files /dev/null and b/lectures/cpp-intro/frank_mon.jpg differ diff --git a/lectures/cpp-intro/hello/hello.cpp b/lectures/cpp-intro/hello/hello.cpp new file mode 100644 index 0000000..6c3459b --- /dev/null +++ b/lectures/cpp-intro/hello/hello.cpp @@ -0,0 +1,5 @@ +#include + +int main(int argc, char* argv[]) { + std::cout << "Hello, world" << std::endl; +} diff --git a/lectures/cpp-intro/index.html b/lectures/cpp-intro/index.html new file mode 100644 index 0000000..3b8bc49 --- /dev/null +++ b/lectures/cpp-intro/index.html @@ -0,0 +1,22 @@ + + + +A brief introduction to C++ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/lectures/cpp-intro/octodog.jpg b/lectures/cpp-intro/octodog.jpg new file mode 100644 index 0000000..d3e5ec4 Binary files /dev/null and b/lectures/cpp-intro/octodog.jpg differ diff --git a/lectures/cpp-intro/sak.jpg b/lectures/cpp-intro/sak.jpg new file mode 100644 index 0000000..28e9b96 Binary files /dev/null and b/lectures/cpp-intro/sak.jpg differ diff --git a/lectures/eigen/README.md b/lectures/eigen/README.md new file mode 100644 index 0000000..c6fe6dd --- /dev/null +++ b/lectures/eigen/README.md @@ -0,0 +1,178 @@ +template: titleslide +# Using Eigen +## Chris Richardson, Rupert Nash +## chris@bpi.cam.ac.uk, r.nash@epcc.ed.ac.uk +## CC-BY + +--- +# What is Eigen3 and why use it? +- C++ library for matrix arithmetic +- “Header only” implementation: no libraries to compile and install (easy) +- Provides some “missing” features needed for scientific computing in C++ +- Contains optimisations to get good performance out of ARM & Intel processors +- Easy to use interface +- Support for dense and sparse matrices, vectors and “arrays” +- Some support for ‘solvers’ (A.x = b) +- Download from https://eigen.tuxfamily.org or e.g. `apt install libeigen3-dev` +- If you know Python, it is a bit like a NumPy for C++ + +--- +# Basics + +```C++ +#include + +int main() +{ + Eigen::Matrix A; + A.setZero(); + A(9, 0) = 1.234; + std::cout << A << std::endl; + return 0; +} +``` + +This is pretty similar to `double A[10][10];` + +--- +# Dynamic size + +```C++ +int n = 64; +int m = 65; +Eigen::Matrix A(n, m); + +A.resize(20, 20); + +std::cout << “Size is ”; +std::cout << A.rows() << “ x ” << A.cols() << std::endl; +``` + +So this is more like a 2D version of `std::vector` + +--- +# Convenience aliases +```C++ +using Eigen::Matrix3d = Eigen::Matrix +using Eigen::Matrix3i = Eigen::Matrix +using Eigen::MatrixXd = Eigen::Matrix +using Eigen::VectorXd = Eigen::Matrix +using Eigen::RowVectorXd = Eigen::Matrix + +``` + +--- +# You can do Matrix arithmetic... + +```C++ +Eigen::MatrixXd A(5, 10); +Eigen::MatrixXd B(10, 2); +Eigen::VectorXd vec(10); + +Eigen::MatrixXd C = A * B; +Eigen::VectorXd w = A * vec; +``` + +Also dot and cross products for vectors, transpose, and usual scalar arithmetic `+ - * /` + +--- +# Element-wise ops with `Array`s + +For example: +```C++ +Eigen::Matrix3d A, B; +// set all values to 2.0 +A.array() = 2.0; + +// set each element of A to sin() of the same element in B +A.array() = B.array().sin(); + +Eigen::Array3d W; +W = W * A; // Error - cannot mix Array with Matrix +``` + +--- +# View other data as a `Matrix` or `Array` + +Mix and match with `std::vector` or any contiguous layout + +It is easy to “overlay” existing memory with an Eigen `Array` or `Matrix`: + +```C++ +std::vector a(1000); + +Eigen::Map> a_eigen(a.data()); + +a_eigen(10, 0) = 1.0; + +Eigen::Map a2_eigen(a.data(), 10, 100); +``` + +--- +# Efficiency: Eigen does lots of checks in debug mode + +Turn optimisation on: `-O2` etc. + +Turn off debug: `-DNDEBUG` + +--- +# Walk through example + +Solve diffusion equation + +$$\frac{\partial T}{\partial t} = k \frac{\partial^2 T}{\partial x^2}$$ + +in 1D using an explicit method + +Each timestep can be solved by using `\(T_{n+1} = A T_n\)` + +1. Create an initial vector for `T` +2. Create a dense matrix for `A` +3. Matrix multiply several times +- Convert to an implicit method: `\(A.T_{n+1} = T_n\)` +- Sparse matrices + + +Type along with me - see `exercises/eigen` + +--- +# Diffusion equation (explicit) + +Subscript means time, square brackets position + +`$$T_{n+1}[i] = T_n[i] + \frac{k \Delta t}{\Delta x^2}(T_n[i-1] - 2T_n[i] + T_n[i+1])$$` + +Our matrix equation is now: + +`$$T_{n+1} = A.T_{n}$$` + +Left-hand side is unknown (next time step) + +Let: `\(\delta = k\Delta t/ \Delta x^2\)` + +--- +# Matrix + +``` +A = [ 1-ẟ ẟ 0 0 0 … ] + [ ẟ 1-2ẟ ẟ 0 0 … ] + [ 0 ẟ 1-2ẟ ẟ 0 … ] + [ 0 0 ẟ 1-2ẟ ẟ 0 0 …] + [ 0 0 0 ẟ 1-2ẟ ẟ 0 …] + +``` + +--- +# Diffusion equation (implicit) + +Subscript means time, square brackets position + +`$$T_{n+1}[i] - \frac{k \Delta t}{\Delta x^2} (T_{n+1}[i-1] - 2*T_{n+1}[i] + T_{n+1}[i+1]) = T[i]$$` + +Left-hand side is unknown (next time step) +`$$A.T_{n+1} = T_n$$` + +Let: $$\delta = k \Delta t/\Delta x^2$$ + +??? +The matrix A is very similar - just flip the sign of the delta terms diff --git a/lectures/eigen/index.html b/lectures/eigen/index.html new file mode 100644 index 0000000..0b675fd --- /dev/null +++ b/lectures/eigen/index.html @@ -0,0 +1,30 @@ + + + +A brief introduction to C++ + + + + + + + + + + + + + + + diff --git a/lectures/frameworks-kokkos/RAJA-MD-perf.png b/lectures/frameworks-kokkos/RAJA-MD-perf.png new file mode 100644 index 0000000..c79e80c Binary files /dev/null and b/lectures/frameworks-kokkos/RAJA-MD-perf.png differ diff --git a/lectures/frameworks-kokkos/README.md b/lectures/frameworks-kokkos/README.md new file mode 100644 index 0000000..c787723 --- /dev/null +++ b/lectures/frameworks-kokkos/README.md @@ -0,0 +1,456 @@ +template: titleslide + +# C++ frameworks for portable performance +## Rupert Nash +## r.nash@epcc.ed.ac.uk + +--- +template: titleslide +# Introduction + +--- + +# Portable performance + +Modern HPC systems usually have several levels of parallelism +- Multiple nodes + +- Each with multi-core CPUs + +- Each with vector FPUs + +Many have accelerators (GPUs) for offloading kernels + +- High FLOP/Watt + +- High bandwidth graphics memory. + +- Complexity of managing offloading and distinct memory spaces + +--- + +# Portable performance + +Challenge: expose algorithms' parallelism in a way that maps to +hardware, with: + +- optimal performance + +- intuitive expression of algorithm + +- portability across different systems (Pure CPU, GPU, Xeon Phi, + future?) + +- only one version of the code base + +Or at least close! + +--- + +# What do we need to control? +.columns[ +.col[ +##Data + +- *Where* is the data? In which memory space? + +- *How* is the data laid out? Array of Structures or Structure of Arrays? +] +.col[ +## Execution + +- *Where* is the computation to take place? + +- *How* do parts relate? `for_each`/`reduce`/etc +] +] +--- + +# Why not just use... + +- CUDA - NVIDIA only + +- OpenCL - NVIDIA, AMD, embedded GPU. CPU working, but performance not + great - quite low-level + +- OpenMP or OpenACC - Maybe eventually! Not quite flexible enough yet, + but OpenACC really only NV/AMD and OpenMP requires different + directives for CPU vs GPU + +- Your suggestions - ... + +--- + +# What about parallel languages? + +- This is an area of active research! Lots have been proposed. Few + are used outside of CS research. + +- You need to trust this has been tested and will be supported for the + life of your application. + +- You need compiler, debugger, libraries that work with it + +- You need to learn a new language. + +- Anyone who wants to maintain/extend your code needs to learn a new + language. + +- Using it on a new architecure may require convincing the compiler + team to port it for you. + +--- + +# C++ can help + +Practical lab had an example of implementing a matrix template class +with Morton order data layout: + +.center[![:scale_img 50%](mortonorder.pdf)] +--- + +# C++ can help + +We have seen how the standard template library makes many algorithms +available to us in a standard way. + +```C++ +std::for_each(data.begin(), data.end(), + [](double& elem) { + elem = DoSomething(); + } +} +``` + +These functions are implemented in standard C++ - you could write an +equivalent. + +--- +# C++ can help + +If `InputIt` is a random-access iterator (e.g. from a `std::vector`) +then you could use this to parallelise with OpenMP (v3 or greater) + +```C++ +template< class InputIt, class UnaryFunction > +void omp_for_each(InputIt first, InputIt last, + UnaryFunction f) { +#pragma omp parallel for + for(InputIt iter = first; + iter < last; ++iter) { + f(*iter); + } +} +``` + +--- + +# C++ can help + +You can use this as a drop-in replacement for `std::for_each`: +```C++ +omp_for_each(data.begin(), data.end(), + [](double& elem) { + elem = DoSomething(); + }); +``` + +But this is just one thing on one method of parallelisation - we want +more and across all levels in a node, at least. + +--- +# Various options + +Fortunately a number of groups have done much of this work for us! + +- targetDP + +- Kokkos + +- RAJA + +- Many more that target a subset of platforms: + + - Thrust: CUDA specific containers and algorithms. Easy + interoperability with CUDA + . + - Hemi: CUDA only but hides much of the complexity of writing + device code . + - C++AMP - Microsoft developed language extension allowing you to + abstract the details writing code for execution on host or + GPU. + - Many more! + +--- + +# targetDP (DP = data parallel) + +- Alan Gray's (ex-EPCC, now NVIDIA) project + +- Using C as the host language. Targets parallelism using OpenMP (CPU, + Xeon Phi) and CUDA. + +- Data elements accessed through macros to abstract layout; data + allocation and movement handled by simple API. + +- Parallelism expressed with two macros: `__targetTLP__` and + `__targetILP__` for thread and instruction level parallelism + respectively. + +- Good if you want something lightweight and are using C. + +- source in LUDWIG + + +--- +# RAJA From LLNL (one of the big US national labs) + +C++ library that uses these concepts to achieve performance portability. + +DAXPY combined with a reduction: +```C++ +double* x; double* y; +RAJA::SumReduction sum(0.0); + +RAJA::forall(begin, end, + [=] (int i) { + y[i] += a * x[i]; + sum += y[i]; + }); +``` + +--- + +#RAJA performance + +.columns[ +.col[ +- CoMD is a classical molecular dynamics code, written in C + (baseline - green) + +- The RAJA authors ported it to RAJA naively (RAJA-reference - blue) + and then optimized some parts (RAJA-schedule - yellow). + +- Better than the optimized C version! They only had to alter 2% of + lines and now it runs on GPU too. + ] + .col[ +![:scale_img 100%](RAJA-MD-perf.png) +]] + +--- + +# Kokkos + +Features quite similar to RAJA and also a product of a US national +lab: Sandia this time. + +They are much more open and have several major applications +(e.g. LAMMPS molecular dynamics code, SPARTA rarefied gas code) and +libraries (Trilinos) using Kokkos. + +They are committed to supporting use of Kokkos internally and +externally. + +They have a library of routines for BLAS/LAPACK/Graph manipulation and +a growing ecosystem. + +--- + +# Kokkos simple example 1 + +HPC's favourite simple test, saxpy (single precision `a*x + y`) + +```C++ +kokkos::parallel_for(N, + KOKKOS_LAMBDA(int i) { + y(i) = a * x(i) + y(i); + }); +``` + +This will run the lambda `N` times in the "default execution space" +you have set (by default the CPU, but could easily be the GPU) + +The `KOKKOS_LAMBDA` is to deal with CUDA requiring lambdas to have the +`__device__` attribute. + +--- + +# Kokkos simple example 2 + +Dot product of `x` and `y`: + +```C++ +float result = 0; +parallel_reduce(N, + KOKKOS_LAMBDA(int i, float& value) { + value += x(i) * y(i); + }, result); +``` + +Kokkos manages each thread's local temporary values and reduces them +in a scalable way based on the execution space. + +--- + +# Kokkos data + +Kokkos uses a lightweight "View" template class to store data. It can +be considered much like a `std::shared_ptr`. + +Rank (number of dimensions) is fixed at compile time + +Size of array can be set at compile or run time (run time must go +first). + +```C++ +// 2 run, 0 compile +View d1("label", M, N); + +// 1 run, 1 compile +View d2("label", M); + +// 0 run, 2 compile +View d3("label"); +``` + +--- + +# Kokkos data + +Allocation and copy only occur when explicitly specified + +Copy construction and assignment are shallow. + +Only when last view holding a reference destructs is the data +deallocated. + +```C++ +View a("a"), b("b"); +b = a; +{ + View c(b); + a(0) = 1; b(0) = 2; c(0) = 3; +} +std::cout << a(0); +``` + +What is printed? + +-- +3 + +--- + +# Memory spaces + +Kokkos has the concept of a **memory space**. E.g. + - main host DRAM + - GPU memory + - Xeon Phi HBM + - CUDA Unified memory + +You give this as a second template argument to the view: + +```C++ +View view(M, N); +``` + +If you don't provide one, it will use a default suitable for your +default execution space. + +--- + +# Memory layout + +Kokkos also controls memory layout with a template parameter: + +```C++ +View CStyle(M,N); +View FortranStyle(M,N); +``` + +If you don't provide one, it will use the default for the memory space +(e.g. `LayoutLeft` for `CudaSpace` and `LayoutRight` for `HostSpace`). + +You can define your own layouts. + +--- + +# Mirrors and copies + +A *mirror* is a view of a compatible array residing in a (possibly) +different memory space. + +You must explicitly request copying with `deep_copy`. +```C++ +// Create your device array +View data(N); + +// Create a host mirror of that +auto host_data = create_mirror_view(data); + +// Populate on host +host_data = ReadFromFile(); +deep_copy(data, host_data); + +// Use on device +parallel_for(...); +``` + +--- + +# Some results +.columns[ +.col[ +- RamsesGPU is an astrophysical magnetohydrodynamics code, started in 2009, parallelised with CUDA and MPI. + +- The developers have ported some parts to use Kokkos instead of CUDA. + +- Without much optimization effort they see performance only 2--5% worse than their highly-tuned CUDA +] +.col[ + And better CPU-only performance: +![:scale_img 100%](RamsesKokkos.png) + +Gray: Original code +] +] + +--- + +# Some results +.columns[ +.col[ +- LAMMPS - major open source MD code + +- Very widely used + +- Was the first major (public) code to use Kokkos + +- Most recent benchmarks from http://lammps.sandia.gov/bench.html +] +.col[ +Lennard-Jones case (Higher is better) +![:scale_img 100%](lammps_lj.png) +]] + +--- +# Some results +.columns[ +.col[ +- LAMMPS - major open source MD code + +- Very widely used + +- Was the first major (public) code to use Kokkos + +- Most recent benchmarks from http://lammps.sandia.gov/bench.html +] +.col[ +Stillinger-Weber potential (Higher is better) + +![:scale_img 100%](lammps_sw.png) + +Highly tuned vendor implementations can still win! +]] diff --git a/lectures/frameworks-kokkos/RamsesKokkos.png b/lectures/frameworks-kokkos/RamsesKokkos.png new file mode 100644 index 0000000..34a589d Binary files /dev/null and b/lectures/frameworks-kokkos/RamsesKokkos.png differ diff --git a/lectures/frameworks-kokkos/index.html b/lectures/frameworks-kokkos/index.html new file mode 100644 index 0000000..bdd2a97 --- /dev/null +++ b/lectures/frameworks-kokkos/index.html @@ -0,0 +1,22 @@ + + + +C++ for numerical computing + + + + + + + + + + + + + + diff --git a/lectures/frameworks-kokkos/lammps_lj.png b/lectures/frameworks-kokkos/lammps_lj.png new file mode 100644 index 0000000..949bdc2 Binary files /dev/null and b/lectures/frameworks-kokkos/lammps_lj.png differ diff --git a/lectures/frameworks-kokkos/lammps_sw.png b/lectures/frameworks-kokkos/lammps_sw.png new file mode 100644 index 0000000..1e59b65 Binary files /dev/null and b/lectures/frameworks-kokkos/lammps_sw.png differ diff --git a/lectures/frameworks-kokkos/mortonorder.png b/lectures/frameworks-kokkos/mortonorder.png new file mode 100644 index 0000000..5a376dc Binary files /dev/null and b/lectures/frameworks-kokkos/mortonorder.png differ diff --git a/lectures/loops-containers/README.md b/lectures/loops-containers/README.md new file mode 100644 index 0000000..48ce1e5 --- /dev/null +++ b/lectures/loops-containers/README.md @@ -0,0 +1,482 @@ +template: titleslide +# Containers, loops, and iterators +## Rupert Nash +## r.nash@epcc.ed.ac.uk + +??? + +We've pretty much only talked about working on single things at once + +Simulation/analysis usually require us to deal with many things: many +data points, many atoms in your MD simulation, many images to +recognise objects in etc. + +--- +# Containers + +So we have some intuition about what this means + +In C++ a container: + +- Holds objects of a uniform type + +- Owns the objects that it contains + +- Provides access to its elements + +The means of access and the performance of these operations depends on +the container. + +--- +# Standard library containers + +The standard library has 13 container template classes, but we'll only touch on a few. + +- `vector` - a dynamically sized contiguous array + +- `array` - a statically sized contiguous array + +- `list`/`forward_list` - a doubly/singly linked list + +- (`unordered_`)`set` / (`unordered_`)`map` + +--- +# vector + +The size of the vector is determined at run time (and hence memory is +allocated then) + +The elements are *contiguous in memory*, so it is fast to jump to any +element by index and to iterate though them. + +```C++ +#include +#include + +void ShowData() { + std::vector data = GetData(); + for (int x: data) { + std::cout << x << "\n"; + } +} +``` + +![:thumb](Use by default - data locality often wins over algorithmic complexity) + +??? + +Poorly named, but we're stuck: `std::vector` is not a vector in either +the 3D spatial sense nor the vector space sense... + +We put the type of the elements inside the angle brackets + +Here we show a for loop that will iterate through every element of the vector + +Why does contiguous memory => fast? Because main memory is slow, chips +have caches which bring lines of memory into small high speed bits of +memory on chip. They also notice when you are running through a chunk +of memory and pre-fetch (or the compiler does this) + +--- +# vector + +Supports: + +- copy and move (if element type is `noexcept` moveable) + +- random element access by index + +- resize (and pre-reserving memory) + +- element insertion + +- element deletion + +Note that it *owns* its elements: when it reaches the end of its +lifetime, contained elements will also be destroyed. + +Be aware that resizing operations may force reallocation and copying! + +--- +# vector building + +```C++ +std::vector first_n_primes(unsigned n) { + std::vector ans; + + unsigned maybe_prime = 2; + + while (ans.size() < n) { + if (is_prime(maybe_prime)) { + ans.push_back(maybe_prime); + } + maybe_prime += 1; + } + return ans; +} +``` +??? +Default constructor creates an empty vector + +`push_back` adds a new value to the end of the vector + +This might require a re-allocation of memory and copying the contents - + +Discuss capacity. + +--- +# array + +- Contiguous in memory but the size is fixed at compile time. + +- Almost like a vector, but you can't change the size. + +- Only difference is construction - list the values inside the braces + +```C++ +#include +using GridPoint = std::array; + +auto p1 = GridPoint{1,2,3}; +std::cout << p1.size() << std::endl; +// Prints 3 +``` + +--- +# list (and forward_list) + +- Almost always implemented as a doubly (singly) linked list. + +- Elements are allocated one by one on the heap at run time. + +- Traversal requires following pointers from one end + +- Fast element insertion and deletion (**if** you don't have to look for + the element!) + +![:thumb](Use when you will be adding and removing from the ends a lot + and the contained objects are expensive to copy/move. + +Consider converting to `vector` if you have a build/access pattern.) + +--- +# set and map + +- These are associative containers implemented as sorted data + structures for rapid search (typically red-black trees). + +- `set` is just a set of keys, `map` is a set of key/value pairs + (types can differ). + +- You must have defined a comparison function for the key type. + +- You may want to use the `unordered` versions which use a hash + table (requires a hash function) + +![:thumb](Use if you either + +- have a large key space that mostly lacks value, or + +- will be looking up unpredictable values a lot or frequently + adding/removing values. + ) + +--- +# Map example + +In some structural simulation each parallel processs might own a part of the domain. + +.center[ +![:scale_img 60%](domain_decomp.png) +] + +--- +# Map example + +So after starting all the processes and reading the domain we might +have code like this: + +```C++ +auto rank2comms = std::map{}; +for (auto p = 0; p < MPI_COMM_SIZE; ++p) { + if (ShareBoundaryWithRank(p)) { + rank2comms[p] = BoundaryComm(my_rank, p); + } +} +// later +for (auto [rank, bc]: rank2comm) { + bc->SendData(local_data); +} +``` +??? + +Map takes two type parameters in the angle brackets: the key type and +value type + +What's with the square brackets? It's a structured binding similar to +python's tuple unpacking + +--- +# Guidelines + +> Each container has its traits +> That define the places where they are great +> Particularly vector +> You don't need a lecture +> Just use vector + +> Where choosing a container, remember vector is best +> Leave a comment to explain if you choose from the rest + +Credit - [Tony van Eerd](https://twitter.com/tvaneerd) @ [CppCon 2017](https://youtu.be/QTLn3goa3A8?t=332) + +--- +template: titleslide +# Loops + +--- +# Loops + +Three types: + +- `while` + +- range-based `for` loop + +- C-style `for` loop + +--- +# While + +```C++ +while (boolean expression) { + // Code +} +``` + +??? + +The expression inside the parens can be anything that is convertable to `bool` + +`int` - zero => false, anything else => true + +Consider computing some prime numbers + +--- +# While + +```C++ +std::vector first_n_primes(unsigned n) { + std::vector ans; + + unsigned maybe_prime = 2; + + while (ans.size() < n) { + if (is_prime(maybe_prime)) { + ans.push_back(maybe_prime); + } + maybe_prime += 1; + } + return ans; +} +``` + +??? + +condition can be a variable or a more complex condtion + +--- +# Range based for loop + +```C++ +#include +#include + +void ShowData(std::vector& data) { + for (int x: data) { + std::cout << x << "\n"; + } +} +``` + +??? + +Syntax + +Read it as "for x in data" + +Compare to python + +Point out we are copying each element into a local variable `x` + +--- +# Range based for loop + +```C++ +#include + +void DoubleInPlace(std::vector& data) { + for (int& x: data) { + x *= 2; + } +} +``` + +??? +We are making `x` a reference now + +Changes made through it will update the value inside the container + +--- +# Range based for loop + +What can you iterate over here? + +- Any type with `begin()` and `end()` member functions + +- All the STL containers have this + +- Any type where you have overloads of the free functions `begin` and + `end` accepting your object + +--- +# C-style for loop + +Familiar to C programmers + +Fortran programmers will know this as a `DO` loop + +```C++ +void DoubleInPlace(std::vector& data) { + for (int i = 0; i < data.size(); ++i) { + data[i] *= 2; + } +} +``` + +??? + +This is the canonical form for a counting loop, but you can implement +all sort of things + +Remind people that C counts from zero (because it's how many elements +to advance past the first one) up to but not including the end + +Note pre-increment - do not post increment without good reason + +Sometime you really need this - but should usually have a range for +loop for the simple cases. "What you say when you say nothing" + +--- +template: titleslide +# Iterators + +--- + +# What is an iterator? + +Let's think about a C-style for loop: + +```C++ +void DoubleInPlace(std::vector& data) { + for (int i = 0; i < data.size(); ++i) { + data[i] *= 2; + } +} +``` + +??? + +In this function we are mixing things up +- The way in which we iterate through `data` +- The operation that we perform on it +- The way in which we access an element + +We are also requiring that `data` supports random access (cos +`operator[]`). Not all containers do + +--- +# Example using iterators + +```C++ +void DoubleInPlace(std::vector& data) { + for (std::vector::iterator it = data.begin(); + it != data.end(); ++it) { + *it *= 2; + } +} +``` + +This is the "old fashioned C++" way + +--- +# Example using iterators + +```C++ +void DoubleInPlace(std::vector& data) { + for (auto it = data.begin(); it != data.end(); ++it) { + *it *= 2; + } +} +``` + +This uses `auto` to let the compiler deduce the type for you + +??? + +Discuss pre-increment (optimiser is not perfect) + +Discuss `operator*` + +Discuss `operator==` + +--- +# Implementing your own iterator + +To define your own iterator, you need to create a type with several +operator overloads (exactly which ones depends on the category of +iterator you need). Basics are: + +- dereference operator (`*it`) - you have to be able to get a value + (either to read or write) + +- pre-increment (`++it`) - you have to be able to go to the next +one +![:fn](Why not post-increment? Because this has to return the + value of `it` from *before* it was incremented. This usually means + a copy.) + +- assigment - you need to be able to bind it to name + +- inequality comparison (`it != end`) - you need to know when you are +done + +![:fn_show]( ) + +--- +template: titleslide +# Exercise + +--- +# Containers exercise + +In your clone of this repository, find the `containers` exercise and list +the files + +``` +$ cd APT-CPP/exercises/containers +$ ls +Makefile test.cpp vector_ex.cpp vector_ex.hpp +``` + +As before, `test.cpp` holds some basic unit tests. + +`vector_ex.cpp`/`.hpp` hold some functions that work on `std::vector` - provide the implementations + +The tests require that you implement, in a new header/implementation +pair of files, a function to add a string to a `std::map` as the key, +the value being the length of the string. + +You will want to find the documentatation for `map` on https://en.cppreference.com/ + +You can compile with `make` as before. diff --git a/lectures/loops-containers/domain_decomp.png b/lectures/loops-containers/domain_decomp.png new file mode 100644 index 0000000..0fa4156 Binary files /dev/null and b/lectures/loops-containers/domain_decomp.png differ diff --git a/lectures/loops-containers/index.html b/lectures/loops-containers/index.html new file mode 100644 index 0000000..bdd2a97 --- /dev/null +++ b/lectures/loops-containers/index.html @@ -0,0 +1,22 @@ + + + +C++ for numerical computing + + + + + + + + + + + + + + diff --git a/lectures/resources/README.md b/lectures/resources/README.md new file mode 100644 index 0000000..ea34ebf --- /dev/null +++ b/lectures/resources/README.md @@ -0,0 +1,781 @@ +template: titleslide +# Resource management + +--- +# Resources + +In a program you often need to manage things like: + +- memory + +- open files + +- GPUs + +- network sockets + +??? + +Let's talk a bit about memory first + +--- +# Memory: overview + +.columns[ +.col[ +Modern OSes given each process a flat address space + +Each byte can be accessed by its address, which is just a number. + +For most purposes this can be considered in two parts: +- stack +- heap +] +.col[ +.center[ +![:scale_img 50%](mem_layout.jpg) +] +] +] + +--- +# Memory: stack + +In C++ (and C and many other compiled languages) local variables are +stored in the **stack**. + +As your program runs, the local variables that are defined, get space +allocated by the compiler relative to the current stack frame. + +Each time you call a function, you start a new stack frame by +incrementing the stack pointer. + +Variables are implemented as offsets from this, so allocating a local +variable has no run-time cost. + +When you return from a function, the stack pointer is updated and +deallocation also has no cost + +These allocations are called *static* because they have to be prepared +at compile time + +--- +# Memory: heap + +The language and OS also make available some memory for dynamic +allocation: the *heap* + +You need to request some memory to store an object and then give it +back when you are finished + +??? + +Fortran has allocatable arrays and somewhat restricted pointers + +C programmers will be familiar with malloc and free, which is also +present in C++, but should never be used (I can't recall ever having +done so) + +--- +# Pointers in C++ + +In C++ you can get a pointer to an object using `&` (the *address of* operator) + +You can read or write the value-that-is-pointed-to with `*` (the *dereference* operator) + +```C++ +int main() { + int i = 99; + int* i_ptr = &i; + + *i_ptr += 1; + std::cout << i << std::endl; +} +``` + +--- +# Pointers vs references + +Pointers are a lot like references, but they are not guaranteed to be +initialised to a valid value! + +It is valid to assign an arbitrary value to a pointer, but actually +using it is **undefined behaviour** - typically a crash but could be +silent corruption of data. + +```C++ +int main() { + int* i_ptr = 0xdeadbeef; + + *i_ptr += 1; + std::cout << i << std::endl; +} +``` +Use pointers with great caution! + +--- +# Pointers vs references + +Pointers are a lot like references, but they are not guaranteed to be +initialised to a valid value! + +It is valid to assign an arbitrary value to a pointer, but actually +using it is **undefined behaviour** - typically a crash but could be +silent corruption of data. + +```C++ +int* make_bad_pointer() { + int i = 42; + return &i; +} +``` + +Returning a pointer to a local variable is a recipe for problems + +??? + +Because once that function returns the lifetime of the target object +has ended and accessing it is UB + +--- +# Pointers to dynamically allocated memory + +In C++ you can manually create instances of objects dynamically with +`new` and try to remember to destroy them with `delete` + +But doing so is a recipe for problems! + +??? + +Using new and delete throughout your code is generally going to cause +you problems, even with extensive use of tools like AddressSanitizer (ASan) +and Valgrind + +But C++ has a design pattern that can tame this - first another feature of the language + +--- +# Destructors + +You can also control what happens when your objects reach the end of +their lifetime. + +When this happens is deterministic: +- when a local variable goes out of scope +- when an object that contains them is destroyed +- when the programmer `delete`s them + +For a class `Name` they are declared like: + +```C++ +struct Name { + ~Name(); +}; +``` + +It's important to note that you should **never call this directly** - +the compiler will call it for you when your objects are deallocated. + +??? + +Note the tilde syntax is the logical negation of the class. Cf +annihilation operators for any physicists. + + +--- +# Resource allocation is instantiation + +A very important pattern in C++ is **RAII**: resource allocation is +instantiation. + +Also known as constructor acquires, destructor releases (CADRe). + +This odd name is trying to communicate that any resource you have +should be tied to the lifetime of an object. + +So the when the compiler destroys your object it will release the +resource (e.g. memory). + +??? + +Saying that in some philosophical sense allocating a resource is the +creation of something, which implies its destruction later. + +--- +# RAII example + +A very simple copy of `std::vector`: + +```C++ +class my_array { + unsigned size = 0; + double* data = nullptr; +public: + my_array() = default; + explicit my_array(unsigned n) : size(n), data(new double[size]) {} + ~my_array() { + delete[] data; + } + double& operator[](unsigned i) { + return data[i]; + } +}; +``` + +??? + +This class allocates some memory to store `n` doubles when constructed + +When it reaches the end of its life the destructor returns the memory +to the OS + +It allows users to access elements (with no bounds checking) + +--- +# What happens when we compile and run? + +Add a few annotations to print in the contructor/destructor + +??? + +Open sample/arr1.cpp +Compile and run + +What happens if we copy x? + +Add `auto x_cp = x;` + +--- +# Copying + +When you value assign an object in C++ this will only be valid if +there is a *copy constructor* or *copy assignment operator* + +-- + +Copy constructor - when you create a new object as the destination: + +```C++ +my_array x{10}; // Direct initialisation +my_array y{x}; // Direct initialisation +my_array z = x; // Copy initialization +``` + +-- +Copy assignment - when you assign a new value to an existing object + +```C++ +my_array x{10}; +x = my_array{2000}; +``` + +??? + +What's the diff? + +In the last case, you have to deal with releasing any resources held +by the target object + +--- +# Implicit copy + +The compiler will automatically generate these operations for us if +all the data members of you class are copyable. + +So what went wrong with the example shown? + +-- +A pointer is just a number and so it can be copied implicitly - hence the double delete + +If we want to copy our array then we need to either: +- copy the data (aka deep copy) +- share the data and somehow keep track of when the last reference to it is destroyed (aka shallow copy) + +??? + +Deep copies are more expensive at run time but somewhat safer + +Shallow copies can be faster but harder to implement correctly and can have thread safety issues + +Do we want to copy? + +--- +# User-defined copy + +Of course, you can control how your objects are copied + + +```C++ +class my_array { + unsigned size = 0; + double* data = nullptr; +public: + my_array() = default; + explicit my_array(unsigned n) : size(n) data(new double[size]) {} + my_array(my_array const& other) : size(other.size), data(new double[size]) { + // Copy data + } + my_array& operator=(my_array const& other) { + delete[] data; + size = other.size; + data = new double[size]; + // Copy data + return *this; + } + ~my_array() { + delete[] data; + } +}; +``` +??? + +Open arr2.cpp + +Note the signature + +--- +# Returning a value looks a lot like copying + +When a function returns a value, you might think that will copy it to the target: + +```C++ +std::vector ReadData() { + std::vector answer; + // Read it from somewhere + return answer; +} + +int main() { + auto data = ReadData(); +} +``` + +??? + +Thinking about std::vector examples we've seen and that you might have implemented + +Have previously said that you should use bare auto when you want a +copy - by that what we really mean is you want to *own* the object and +control its lifetime. + +Copying a vector of billions of elements is going to get expensive and +would be counter to C++'s zero overhead abstractions principle + +--- +# Move instead + +Since C++11, the language has supported the concept of *moving* from +objects which the compiler knows (or the programmer asserts) will not +be used any more. + +Examples are: +- temporaries (i.e. the result of a function call/constructor expression) +- automatic variables that are going out of scope +- the result of calling `std::move` on an object + +The destination object "steals" the contained resources from the +source object and sets the source to a valid but undefined state - +typically the only operations you can perform on a moved-from object +are destruction and assignment. + +--- +# Move implementation + +Going back to our simple array: +```C++ +class my_array { + unsigned size = 0; + double* data = nullptr; +public: + // c'tors, copy assignment, d'tor + my_array(my_array&& other) noexcept : size(other.size), data(other.data) { + other.size = 0; + other.data = nullptr; + } + my_array& operator=(my_array&& other) noexcept { + std::swap(size, other.size); + std::swap(data, other.data); + } +}; +``` + +??? + +Comment on `noexcept` - this is for STL compatibility. The containers +will copy if your move operations are not noexcept. These ones cannot +throw exceptions so this is safe. + + +Look at arr3.cpp + +--- +# The Rule of Five + +This says that if you define or delete one of the following: +- copy constructor +- copy assignment operator +- move constructor +- move assignment operator +- destructor + +then you should probably do so for all five. + +??? +This can be quite a lot of work! + +--- +# The Rule of Zero + +This says that unless your class is solely deals with ownership, then +it should define none of the five special functions. + +This is really a corollary of the general software engineering +"principle of single responsibility". + +You should split your code into a resource manager with all five +functions and a user class that has none, but uses the resource +manager as one or more data members. + +??? + +If it does deal with ownership then rule of 5 applies :( + +--- +# Standard library to the rescue! + +The standard library contains some help: + +`std::unique_ptr` is a pointer that uniquely owns the object(s) it +points to. + +Pointers can be moved but *not* copied - this is achieved by the copy +constructor and copy assignment operators being `delete`d: + +``` +class unique_ptr { + unique_ptr(unique_ptr const &) = delete; + unique_ptr& operator=(unique_ptr const &) = delete; +}; +``` +??? + +Ownership means that it will destroy them when it is destroyed + +Obviously there would be a lot more code in the implementation + +The syntax is basically the same as defaulting a special function + +--- +# Using std::unique_ptr + +```C++ +#include + +class Image { + Image(std::string const& file) { + // construct by reading from file... + } +}; + +std::unique_ptr ReadImage(std::string const& filename) { + return std::make_unique(filename); +} + +int main() { + auto img_ptr = ReadImage("cats.jpg"); +} +``` + +??? + +What's going on + +Include the memory header + +Some type `Image` that has a constructor to read it from a file + +Instead of constructing it in the usual way, we pass the constructor +arguments to `make_unique` and it will be allocated on the heap for us + +We return the value and because of that, the compiler knows it is +moveable so we move into the local variable img_ptr + +When this goes out of scope and is destroyed, it will destroy the +object that is pointed-to + +--- +# Using pointers + +```C++ +class Image { + int x() const { + //return x_size; + } + int y() const { + //return y_size; + } +}; +std::unique_ptr ReadImage(std::string const& filename) { + return std::make_unique(filename); +} + +int main() { + auto img_ptr = ReadImage("cats.jpg"); + Image& img_ref = *img_ptr; + + auto area = img_ref.x() * img_ref.y(); +} +``` + +??? + +We didn't do anything with that pointer. Let's imagine it has some +member functions that return the size in pixels and we want to compute +the area + +We can dereference the pointer with `operator*` + +Returns a reference to the thing-that-is-pointed-to which we can use as normal + +--- +# Using pointers + +```C++ +class Image { + int x() const { + //return x_size; + } + int y() const { + //return y_size; + } +}; +std::unique_ptr ReadImage(std::string const& filename) { + return std::make_unique(filename); +} + +int main() { + auto img_ptr = ReadImage("cats.jpg"); + + + auto area = (*img_ptr).x() * (*img_ptr).y(); +} +``` + +??? +Don't have to bind a name but this syntax looks rather ugly + +--- +# Using pointers + +```C++ +class Image { + int x() const { + //return x_size; + } + int y() const { + //return y_size; + } +}; +std::unique_ptr ReadImage(std::string const& filename) { + return std::make_unique(filename); +} + +int main() { + auto img_ptr = ReadImage("cats.jpg"); + + + auto area = img_ptr->x() * img_ref->y(); +} +``` + +??? + +Can use the pointer member access operator -> as a much more readable shorthand + +--- +# std::shared_ptr + +Sometimes you want to be able to safely share an object between many +users. + +Enter `std::shared_ptr` + +This keeps count of how many pointers are alive: increasing the count +on copy and decreasing on destruction. + +When this reaches zero the object is destroyed. + +```C++ +int main() { + auto shared = std::make_shared(); + + auto foo = LongLivedObject{shared}; + complicated_function(foo, shared); +} +``` + +![:thumb](Prefer unique_ptr unless you really need to share) + +??? + +Why not keen on shared_ptr? + +More complex, destruction no longer deterministic + +2 other annoying problems with solutions + +--- +# Niggles with shared_ptr 1 + +Sometimes you want your class to be able to get a `shared_ptr` to +itself (e.g. to create some other object that depends on it) + +```C++ +class Widget : public std::enable_shared_from_this { +public: + std::shared_ptr self() { + return shared_from_this(); + } +}; +``` + +You must ensure that a `shared_ptr` has been made before calling +shared_from_this! + +??? + +Ensure `shared_ptr` has been made first by e.g. making constructors +private and calling them from a factory function that returns a +`shared_ptr` + +--- +# Niggles with shared_ptr 2 + +Cycles: + +```C++ +class Cowboy { + using ptr = std::shared_ptr; + std::string name; + std::shared_ptr partner; +public: + Cowboy(std::string const& n) : name(n) {} + ~Cowboy() { std::cout << "Delete " << name << std::endl; } + friend void partner_up(ptr a, ptr b) { + a->partner = b; b->partner = a; + } +}; + +int main() { + auto good = std::make_shared("Alice"); + auto bad = std::make_shared("Bob"); + //ugly + partner_up(good, bad); +} +``` +??? + +Show the code in `sample/shared.cpp` - same as above but instrumented. +Compile and run and note no call of destructor! + +The way to break cycles is to use `std::weak_ptr` which doesn't count +towards the reference count. + +To use a `weak_ptr`, you must call `lock` member function which +returns a `shared_ptr` that ensures the object lives long enough to be +used. + +--- +# Raw pointers + +Now despite having been mean about pointers, there are some valid +uses - in function interfaces + +A function can perfectly well accept a raw pointer as long as it +doesn't want to change the lifetime of the object that is pointed-to + +![:thumb](Raw pointers do not own resources - use a smart pointer for that) + +![:thumb](Raw pointers represent a single object - use a span for a +contiguous range of objects) + +??? + +C++20 has `std::span` but you can use the guideline support library +if, like most of us, not there yet + +--- +# Other resources + +For example files - compare Python to C++ +.columns[ +.col[ +```Python +with open(filename) as f: + data = f.read() + +# file is closed on exiting the block +``` +] +.col[ +```C++ +std::string data; +{ + auto f = std::fstream{filename}; + f >> data; +} // file closed by destructor + +``` +] +] +??? + +Python with statements are opt-in + +Compare to C# using statements (types must implement the `IDisposable` +interface - ugh MS Hungarian notation) + +--- +# RAII for files + +```C++ +class File { +private: + std::unique_ptr handle = nullptr; +public: + File() = default; + File(std::string const& fn, char const* mode) : + handle{std::fopen(fn.c_str(), mode)} { + } + ~File() { + if (handle) { + std::fclose(handle.get()); + } + } + // Read/write member functions +}; +int main() { + auto f = File{"data.dat", "r"}; +} +``` + +??? + +`nullptr` - special value to indicate an invalid pointer + +private constructor + factory static member function + +d'tor closes the file if it has a value + +C++ destructors technically are also opt-in - but they really are the +single best feature of the language! + +Please use them! + +Could also have a network connection, handle to a GPU command stream +etc wrapped here. + +--- +# Exercise + +Try out some of this with exercises/morton-order + diff --git a/lectures/resources/index.html b/lectures/resources/index.html new file mode 100644 index 0000000..3b8bc49 --- /dev/null +++ b/lectures/resources/index.html @@ -0,0 +1,22 @@ + + + +A brief introduction to C++ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/lectures/resources/mem_layout.jpg b/lectures/resources/mem_layout.jpg new file mode 100644 index 0000000..ef33044 Binary files /dev/null and b/lectures/resources/mem_layout.jpg differ diff --git a/lectures/resources/sample/Makefile b/lectures/resources/sample/Makefile new file mode 100644 index 0000000..9e6cd17 --- /dev/null +++ b/lectures/resources/sample/Makefile @@ -0,0 +1,14 @@ +CXXFLAGS = --std=c++17 +CC = $(CXX) + +exes = arr1 arr2 arr3 shared file +all : $(exes) + +clean : + rm -rf *.o $(exes) +arr1 : arr1.o +arr2 : arr2.o +arr3 : arr3.o +shared : shared.o +file : file.o + diff --git a/lectures/resources/sample/arr1.cpp b/lectures/resources/sample/arr1.cpp new file mode 100644 index 0000000..9a0fbe2 --- /dev/null +++ b/lectures/resources/sample/arr1.cpp @@ -0,0 +1,23 @@ +#include + +class my_array { + unsigned size = 0; + double* data = nullptr; +public: + my_array() = default; + my_array(unsigned n) : size(n), data(new double[n]) { + std::cout << "Constructing: " << data << std::endl; + } + ~my_array() { + std::cout << "Destroying: " < + +class my_array { + unsigned size = 0; + double* data = nullptr; +public: + my_array() = default; + my_array(unsigned n) : size(n), data(new double[n]) { + std::cout << "Constructing: " << data << std::endl; + } + my_array(my_array const& other) : size(other.size), data(new double[size]) { + std::cout << "Copy constructing: " << data << std::endl; + std::copy(other.data, other.data + size, data); + } + my_array& operator=(my_array const& other) { + std::cout << "Destroying: " < + +class my_array { + unsigned size = 0; + double* data = nullptr; +public: + my_array() = default; + my_array(unsigned n) : size(n), data(new double[n]) { + std::cout << "Constructing: " << data << std::endl; + } + my_array(my_array const& other) : size(other.size), data(new double[size]) { + std::cout << "Copy constructing: " << data << std::endl; + std::copy(other.data, other.data + size, data); + } + my_array& operator=(my_array const& other) { + std::cout << "Destroying: " < +#include +#include + +class Cowboy { + using ptr = std::shared_ptr; + std::string name; + std::weak_ptr partner; +public: + + Cowboy(std::string const& n) : name(n) { + std::cout << name << std::endl; + } + ~Cowboy() { std::cout << "Delete " << name << std::endl; } + friend void partner_up(ptr a, ptr b) { + std::cout << "BFFs: " << a->name << " & " << b->name << std::endl; + a->partner = b; b->partner = a; + } +}; + +int main() { + auto good = std::make_shared("Alice"); + auto bad = std::make_shared("Bob"); + //ugly + partner_up(good, bad); +} diff --git a/lectures/template/cpptheme.js b/lectures/template/cpptheme.js new file mode 100644 index 0000000..a181302 --- /dev/null +++ b/lectures/template/cpptheme.js @@ -0,0 +1,10 @@ + +epcc.footer_text = "© Rupert Nash, The University of Edinburgh, CC-BY"; +cpptheme = new Theme( + (str => str.substring(0, str.lastIndexOf("/")))(document.currentScript.src), + '$BASEURL/style.css', + { + thumb: function () { + return '.thumb[\n.thumbtxt[\n' + this +'\n]\n]'; + } + }); diff --git a/lectures/template/style.css b/lectures/template/style.css new file mode 100644 index 0000000..91f7c1e --- /dev/null +++ b/lectures/template/style.css @@ -0,0 +1,21 @@ +div.thumb { + padding: 0.5em; + margin: 0.25em; + border-style: solid; + border-color: rgb(0,50,95); + display: flex; + align-items: center; +} + +div.thumb::before { + content: url(thumbs_up.png); +} + +blockquote { + padding: 0.5em; + background-color: #F0F0F0; +} + +.smaller { + font-size: smaller; +} diff --git a/lectures/template/thumbs_up.png b/lectures/template/thumbs_up.png new file mode 100644 index 0000000..3fbef1e Binary files /dev/null and b/lectures/template/thumbs_up.png differ diff --git a/lectures/templates/README.md b/lectures/templates/README.md new file mode 100644 index 0000000..5daafe5 --- /dev/null +++ b/lectures/templates/README.md @@ -0,0 +1,392 @@ +template: titleslide +# Templates +## Rupert Nash +## r.nash@epcc.ed.ac.uk + +??? + +We've hinted at this already, with `std::vector`, `std::shared_ptr` etc + + +--- +# Motivation + +Templates are a method of doing *metaprogramming*: a +program that writes a program. + +An easy example: + +```C++ +int sum(int a, int b) { + return a+b; +} +double sum(double a, double b) { + return a+b; +} +``` + +What if we need this for `float`, `unsigned`, and our `Complex` class +from earlier lectures? + +Going to get boring and hard to maintain quickly! + +??? + +Recall the sum functions from the first lecture + +--- +# Template functions + +```C++ +template +T sum(T a, T b) { + return a+b; +} +``` + +We can *instantiate* this template for a particular type explicitly by +giving the type inside angle brackets: + +```C++ +std::cout << "add unsigned=" << sum(1U, 4U) << std::endl; +std::cout << "add floats=" << sum(1.0f, 4e-2f) << std::endl; +``` + +??? + +Here the compiler will replace every use of `T` with the type you +supply and only then will it compile the code + +--- +# Template functions + +```C++ +template +T sum(T a, T b) { + return a+b; +} +``` + +You can also let the compiler *deduce* what `T` is for you + +```C++ +std::cout << "add unsigned=" << sum(1U, 4U) << endl; +std::cout << "add floats=" << sum(1.0f, 4e-2f) << endl; +``` +??? + +This is called implicit template instantiation - there are a few +wrinkles that we'll talk about soon + +--- +# Template classes + +You can define a template class - i.e. a template that will produce a +class when you instantiate it. + +Let's adapt the `my_array` container to hold any type `T` + +```C++ +template +class my_array { + unsigned size = 0; + T* data = nullptr; + +public: + my_array(); + my_array(unsigned n); + // Copy / move? + ~my_array(); + unsigned size() const; + const T& operator[](unsigned i) const; + T& operator[](unsigned i); +}; +``` + +??? +Talk through the syntax of the template + +We use `T*` as the type of the stored data and `T&` as the return type +of `operator[]` + +--- +# Type aliases + +Often it is useful to be able to create a new name for a type + +C++ supports a `using` declaration for this. + +```C++ + +using MyImportantType = int; + +using iter = std::map>::iterator; +``` + +??? + +Really common when creating class templates as if you don't it may be +very hard to figure out the type parameters it was instantiated for in +from other code. + +--- +# Using declaration in class templates + +```C++ +template +class my_array { + unsigned size = 0; + T* data = nullptr; + +public: + using value_type = T; + using reference = T&; + using const_reference = T const&; + // ... + const_reference operator[](unsigned i) const; + reference operator[](unsigned i); +}; +``` + +--- +# Where to put your implementation? + +Templates are *not* executable code - they tell the compiler how to +create it. + +So the definition must be available in the translation unit of the user of your template - +i.e. typically in a header file. + +You can define the functions in place like: + +```C++ +template +class my_array { +public: + my_array() : _size(0), _data(nullptr) {} +}; +``` + +Or at the end of the header (or equivalently in another file that you +include at the end of your header) + +```C++ +template +my_array::my_array(unsigned n) : _size(n) { + _data = new T[n]; +} +``` + +??? +Point out the uglier syntax of the second form but on the other hand +the class definition shown earlier is cleaner with only the member +function declarations + +--- +# Templates and the One Definition Rule + +So I said earlier that everything used had to be defined exactly once. + +This has two exceptions: + +1. Templates + +2. Functions marked `inline` + +These can be repeated in many "translation units" (i.e. separate +invocations of the compiler) + +At link time the linker will arbitrarily pick one definition to use in +the final executable (so you need to make sure that they are all +identical). + +--- +# Templates and type deduction + +Recall: +```C++ +template +T sum(T a, T b) { + return a+b; +} +``` + +We then used this without specifying, explicitly, what type `T` was - e.g.: +```C++ +int x = 1, y = 2; +auto z = sum(x, y); +``` + +The compiler is doing *template argument deduction*. + +This means it examines the types of the expressions given as arguments +to the function and then tries to choose a `T` such that the type of +the argument and the expected type match. + +--- +# Templates and type deduction + +Important to note that the template parameter `T` and the type of +function arguments might be different (but related) + +```C++ +template +void f(T x); + +template +void ref(T& x); + +template +void const_ref(T const& x); + +template +void forwarding_ref(T&& x); + +template +void vec(std::vector x); +``` + +This will affect the deduced argument type + +--- +# Non-type template parameters + +Can also parameterise template with non-types: +- integers +- pointers +- enums +- (and in C++20, floating point types and "literal types") + +E.g.: + +```C++ +template +class Vec; + +template +class Matrix; + +template +Vec operator*(Matrix const& A, Vector const& x); +``` + +??? +The compiler will now ensure that the matrix and vector are of compatible +shapes and if you make a mistake will give an error! + +The size of the answer is correctly deduced for you + +--- +# Templates and type deduction + +Full rules are quite complex + +See Meyer's Effective Modern C++ chapter 1 - free online +https://www.safaribooksonline.com/library/view/effective-modern-c/9781491908419/ch01.html + +In short: + +![:thumb]( But usually you can ignore these and just think about: +1. Whether you want to copy the argument or not - if you don't want a +copy add a reference `&` +2. Whether you can handle a const argument - if so add a `const` qualifier +3. If you want *exactly* the type of the expression - if so add `&&` - + this is known as a forwarding reference) + +--- +# Auto + +The `auto` keyword follows the same rules as template argument +deduction + +--- +template: titleslide + +# Traits +--- + +# Type traits + +- Important C++ generic programming technique, used across the + standard library + +- The "if-then-else" of types + +- Provide a template class that has typedefs/member functions that + specify the configurable parts + +- Your generic algorithm then uses this class, specialised on the type + it is working on, to select behaviour + +- You do not have to change the source of either the algorithm or the + working type + +--- +# Trait patterns + +Typically a trait is a template `struct` that accepts one or more types as +parameter(s) and either: + +.columns[ +.col2[ +Tells you something about the type(s) by producing a value, which is +stored in the `value` static member variable, e.g. + +```C++ +if (std::is_pointer::value) { + // do something pointery +} else { + // do something else +} + +// From C++17 onwards: +static_assert(std::is_pointer_v); +``` + +] +.col2[ +Transforms the type in some way by producing a type in the member +type alias `type`. + +```C++ +using signed = + typename std::make_signed< + std::uint32_t + >::type; + +// From C++14 onwards +using signed = + std::make_signed_t; +``` +] +] + +--- + +# STL traits + +Several headers contain these: + +- The header `` has lots of information for handling + types. E.g. `std::is_pointer::value` has value of false. + +- `std::numeric_limits` gives lots of parameters for the built in + number types, such as largest and smallest values, whether they are + integer or floating types, etc. + +Other traits are used everywhere behind the scenes for efficiency. + + +--- +# Writing your own + +Ideally you can define your own as a combination of existing traits. + +If not you can investigate template specialisation, which is beyond +the scope of this course. + +--- +# Exercise + +Please try the second part of [exercises/morton-order](../../exercises/morton-order). + diff --git a/lectures/templates/index.html b/lectures/templates/index.html new file mode 100644 index 0000000..f14fe01 --- /dev/null +++ b/lectures/templates/index.html @@ -0,0 +1,22 @@ + + + +C++ Templates + + + + + + + + + + + + + + \ No newline at end of file diff --git a/lectures/threads-1/README.md b/lectures/threads-1/README.md new file mode 100644 index 0000000..e1ee954 --- /dev/null +++ b/lectures/threads-1/README.md @@ -0,0 +1,264 @@ +template: titleslide +# C++ Threads - Basics + +--- +# Overview + +- Introduction + +- Creating and joining threads + +- Passing arguments to threads + +- Synchronisation + +--- +# C++11 threads + +- API for multithreaded programming built in to C++11 standard +- Similar functionality to POSIX threads + - but with a proper OO interface + - based quite heavily on Boost threads library +- Portable + - depends on C++11 support, available in most compilers today +- Threads are C++ objects + - call a constructor to create a thread +- Synchronisation + - mutex locks + - condition variables + - C++11 atomics +- Tasks + - via async/futures/promises + +--- +# Creating threads + +- Threads are objects of the `std::thread` class +- Threads are created by calling the constructor for this class +- Pass as an argument what we want the thread to execute. This can be: + - A function pointer + - A function object / functor + - A lambda expression + +- Note: you cannot copy a thread + +--- +# Joining threads + +The `join()` member function on a `std::thread` object causes the calling thread to wait for the thread object to finish executing its function/functor/lambda. + +--- +# Hello world – function pointer + +```C++ +#include +#include +#include + +void hello() { + std::cout << "Hello from thread " << std::this_thread::get_id() << std::endl; +} + +int main() { + std::vector threads; + for (int i = 0; i < 5; ++i) { + threads.push_back(std::thread(hello)); + } + + for (auto& thread: threads) { + thread.join(); + } +} +``` + +--- +# Hello world – lambda function + +```C++ +#include +#include +#include + +int main() { + std::vector threads; + + for(int i = 0; i < 5; ++i){ + threads.push_back(std::thread([] { + std::cout << "Hello from thread " << std::this_thread::get_id() << std::endl; + })); + } + + for(auto& thread : threads){ + thread.join(); + } +} +``` + +--- +# Thread IDs + +- Can call `get_id()`on a thread +- Use `std::this_thread::get_id()` to call it on the executing thread +- Returns an arbitrary identifier +- Not much use! +- If we want sequentially numbered threads, need to pass the number as an argument to the thread constructor. + +--- +# Passing arguments to threads + +Arguments to the thread function are moved or copied by value + +Passing simple arguments to threads is straightforward: +```C++ +void hello(int x, int y) { + std::cout << "Hello " << x << " " << y << std::endl; +} + +int main() { + int a = 1; + int b = 27; + std::thread mythread(hello, a, b); + mythread.join(); +} +``` + +--- +# Passing references to threads + +Need to use a reference wrapper to avoid the argument to the thread constructor making a copy +```C++ +void hello(int& x) { + x++; +} + +int main() { + int x = 9; + std::thread mythread(hello, std::ref(x)); + mythread.join(); + std::cout << "x = " << x << std::endl; // x is 10 here +} +``` + +--- +# Synchronisation + +```C++ +class Wallet +{ + int mMoney = 0; +public: + Wallet() {} + + void addMoney(int money) { + mMoney += money; + } +}; +``` + +If two threads call `addMoney()` on the same `Wallet` object, then we have a race condition. + +--- +# Mutex locks + +- Can use a mutex lock to protect updates to shared variables + - natural to declare a mutex inside the object whose data needs protecting + +```C++ +#include +class Wallet +{ + int mMoney = 0; + std::mutex mutex; +public: + Wallet() {} + + void addMoney(int money) { + mutex.lock(); + mMoney += money; + mutex.unlock(); + } +}; +``` + +--- +# Guard objects + +- Need to make sure a mutex is always unlocked +- Can be tricky in cases with complex control flow, or with exception handling. +- The `std::lock_guard` class implements the RAII (resource allocation is initialization) pattern for mutexes +- Its constructor takes as an argument a mutex, which it then locks +- Its destructor unlocks the mutex + +--- +# Guard objects + +```C++ +#include + +class Wallet +{ + int mMoney = 0; + std::mutex mutex; +public: + Wallet() {} + + void addMoney(int money) { + std::lock_guard lockGuard(mutex); + mMoney += money; + } // mutex unlocked when lockGuard goes out of scope +}; +``` + +--- +# Atomics + +- C++ provides an atomic template class `std::atomic` + +- Efficient, lock-free operations supported for specialization to basic integer, boolean and character types + +- Floating point support in C++20 standard only + +--- +# Atomics + +```C++ +#include + +class Wallet +{ + std::atomic mMoney = 0; +public: + Wallet() {} + void addMoney(int money) { + mMoney += money; //atomic increment + } + }; +``` + +--- +# Thread safe class design + +- Possible to add a mutex data member to the class and make every member function that accesses any mutable state acquire and release the mutex (use a `lock_guard`) + +- Good design, in the sense that multithreaded code can use the class without worrying about the synchronisation + +- Can result in unacceptable overheads – lots of lock/unlocks, and synchronization when it’s not needed. + +- Need to think carefully about use cases in a given application. + +--- +# Reusing this material + +.center[![CC-BY-NC-SA](https://mirrors.creativecommons.org/presskit/buttons/88x31/png/by-nc-sa.eu.png)] + +This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. + +https://creativecommons.org/licenses/by-nc-sa/4.0/ + +.smaller[ +This means you are free to copy and redistribute the material and adapt and build on the material under the following terms: You must give appropriate credit, provide a link to the license and indicate if changes were made. If you adapt or build on the material you must distribute your work under the same license as the original. + +Acknowledge EPCC as follows: “© EPCC, The University of Edinburgh, www.epcc.ed.ac.uk” + +Note that this presentation may contain images owned by others. Please seek their permission before reusing these images. +] diff --git a/lectures/threads-1/index.html b/lectures/threads-1/index.html new file mode 100644 index 0000000..b895d67 --- /dev/null +++ b/lectures/threads-1/index.html @@ -0,0 +1,23 @@ + + + +C++ Threads 1 + + + + + + + + + + + + + \ No newline at end of file diff --git a/lectures/threads-2/README.md b/lectures/threads-2/README.md new file mode 100644 index 0000000..de035a0 --- /dev/null +++ b/lectures/threads-2/README.md @@ -0,0 +1,234 @@ +template: titleslide +# C++ Threads – Further topics + +--- +# Overview + +- `std::async` and futures +- Parallel STL +- C++ threads or OpenMP? + +--- +# async and futures + +- With `std::thread` there is no direct way to get a return value from the function/lambda that a thread executes + +- Could alter function to use an output reference argument, but this is problematic + +- Can get round this with `std::async` and futures + +- Also allows exception handling to work properly if a thread throws an exception + +- async returns a future object, and calling the `get()` function on the future blocks until the value is available + +--- +# async and future + +```C++ +int add(int x, int y) { + return x+y; +} + +int main() { + int a = 1; + int b = 27; + std::future fut = std::async(add, a, b); + cout << "sum = " << fut.get() << std::endl; +} +``` + +--- +# async and future + +```C++ +int add(int x, int y) { + return x+y; +} + +int main() { + int a = 1; + int b = 27; + auto fut = std::async(add, a, b); + cout << "sum = " << fut.get() << std::endl; +} +``` + +--- +# Problems! + +- By default, the runtime can decide to not create a new thread to run + the callable, and simply execute it when `fut.get()` is called. + +- Can cause deadlock if you are not careful! + +- Can force asynchronous execution via an optional policy argument: +```C++ +auto fut = std::async(std::launch::async, add, a, b); +``` + +- Implementations can choose to use a thread pool to execute asyncs, + but most don’t can end up with way too many threads + +--- +# Parallel STL + +- Version of the STL that incorporates parallel versions of many STL algorithms + +- Part of C++17 standard, so not yet available in most standard + library implementations, but coming soon. + +- Most STL algorithms take a addition execution policy argument: + * Sequential + * Parallel + * Parallel and vectorized + * (vectorized but not parallel added in C++20) + +--- +# Example + +```C++ +std::vector v = genLargeVector(); + +// standard sequential sort +std::sort(v.begin(), v.end()); + +// explicitly sequential +sort std::sort(std::execution::seq, v.begin(), v.end()); + +// permitting parallel execution +std::sort(std::execution::par, v.begin(), v.end()); + +// permitting vectorization as well +std::sort(std::execution::par_unseq, v.begin(), v.end()); +``` + +--- +# Easy to use, but... + +- For algorithms that take a function object argument (e.g. + `for_each`), that operates on each element it is up to the + programmer to ensure that parallel/vectorized execution of this is + correct. + +- Interface gives almost no control to the programmer + * how many threads to use? + * which threads handle which elements? + * whether to use a static of dynamic assignment of work to threads? + +- Might be difficult to have any control over data affinity/locality + +- Implementations can support additional execution policies + +- Scope for adding more to the standard in the future + +--- +# C++ threads or OpenMP? + +- Since OpenMP supports C++ as a base language, what are the pros and cons of OpenMP vs C++ threads? + +- Adding OpenMP support for new C++ features takes time (several years + in practice) + +- OpenMP 4.5 only supports C++98 + +- OpenMP 5.0 supports most of C++11/14 + + * some exceptions – including C++ threads and atomics + * implementations will take some time to catch up with this + +- If you want to use the latest and greatest features in C++, then + OpenMP might not work. + +--- +# C++ threads or OpenMP? + +OpenMP has a lot of useful features that are not available in the C++ standard: + +- Thread barriers + * will likely be added in C++20 + +- Thread pools and tasks + * may be added in C++23 (depends on "executors") + +- Parallel for loops + * some functionality with `std::for_each` in Parallel STL + +- SIMD directives + * probably will work OK with C++ threads + +--- +# But I can build my own! + +- C++ threads has all the required building blocks to allow you to + implement many of these features for yourself + + * or borrow someone else’s implementation + +- Can make use of C++ functionality to make them look syntactically neat + +- May not be correct + * low level threaded programming is hard! + +- May not be well documented + * unusual to find documentation that is as good as language standards + +- Unlikely to be both portable and efficient + maximum efficiency typically requires architecture-specific coding + +--- +# What about GPUs? + +- OpenMP has support for offloading code to GPUs + + * still a bit immature, not many good implementations yet + +- C++ may support this in the future sometime?? + +- There are a number of C++ frameworks for doing this + + * Kokkos + * Raja + * SYCL + +- Various degrees of support and robustness + +--- +# More differences + +- OpenMP has some restrictions which annoy C++ programmers + + * e.g. can’t use data members of objects in private or reduction clauses + +- Support for reductions is in both, but handled differently + + * OpenMP allows you to define your own reduction operators + * C++ has `std::reduce` in the Parallel STL + +- OpenMP does not allow arbitrary forking/joining of threads + * You can argue whether this is a good thing or not in an HPC context! + +- C++ has no standard way of binding threads to sockets/cores + * need to call OS-specific functions + +--- +# C++ threads or OpenMP? +.center[ +![:scale_img 80%](omp-v-thread.png) +] + +--- +# Reusing this material + +.center[![CC-BY-NC-SA](https://mirrors.creativecommons.org/presskit/buttons/88x31/png/by-nc-sa.eu.png)] + +This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. + +https://creativecommons.org/licenses/by-nc-sa/4.0/ + +.smaller[ +This means you are free to copy and redistribute the material and adapt and build on the material under the following terms: You must give appropriate credit, provide a link to the license and indicate if changes were made. If you adapt or build on the material you must distribute your work under the same license as the original. + +Acknowledge EPCC as follows: “© EPCC, The University of Edinburgh, www.epcc.ed.ac.uk” + +Note that this presentation may contain images owned by others. Please seek their permission before reusing these images. +] diff --git a/lectures/threads-2/index.html b/lectures/threads-2/index.html new file mode 100644 index 0000000..f149417 --- /dev/null +++ b/lectures/threads-2/index.html @@ -0,0 +1,23 @@ + + + +C++ Threads 2 + + + + + + + + + + + + + \ No newline at end of file diff --git a/lectures/threads-2/omp-v-thread.png b/lectures/threads-2/omp-v-thread.png new file mode 100644 index 0000000..6e7f481 Binary files /dev/null and b/lectures/threads-2/omp-v-thread.png differ