diff --git a/docs/quickstart.md b/docs/quickstart.md index 0a3f0625f..f2b12d187 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -106,7 +106,7 @@ $ make -j allgather_test_perf allreduce_test_perf For example, the following command runs the `allreduce5` algorithm with 8 GPUs starting from 3MB to 48MB messages, by doubling the message size in between. You can try different algorithms by changing the `-k 5` option to another value (e.g., `-k 3` runs `allreduce3`). Check all algorithms from the code: [allreduce_test.cu](https://github.com/microsoft/mscclpp/blob/main/test/mscclpp-test/allreduce_test.cu) and [allgather_test.cu](https://github.com/microsoft/mscclpp/blob/main/test/mscclpp-test/allgather_test.cu). ```bash -$ mpirun --bind-to-numa -np 8 ./test/mscclpp-test/allreduce_test_perf -b 3m -e 48m -G 100 -n 100 -w 20 -f 2 -k 5 +$ mpirun --bind-to numa -np 8 ./test/mscclpp-test/allreduce_test_perf -b 3m -e 48m -G 100 -n 100 -w 20 -f 2 -k 5 ``` *NOTE: a few algorithms set a condition on the total data size, such as to be a multiple of 3. If the condition is unmet, the command will throw a regarding error.* diff --git a/include/mscclpp/fifo_device.hpp b/include/mscclpp/fifo_device.hpp index 5806ca510..35b837f47 100644 --- a/include/mscclpp/fifo_device.hpp +++ b/include/mscclpp/fifo_device.hpp @@ -25,7 +25,8 @@ struct alignas(16) ProxyTrigger { uint64_t fst, snd; }; -/// A concurrent FIFO where multiple device threads can push work elements and a single host proxy thread consumes them. +/// A concurrent FIFO where multiple device threads (the number of threads should not exceed the fifo size) can push +/// work elements and a single host proxy thread consumes them. /// /// The FIFO has a head pointer allocated on the device which starts at 0 and goes up to 2^64-1, which is almost /// infinity. 
There are two copies of the tail, one on the device, @ref FifoDeviceHandle::tailReplica, and another on @@ -64,9 +65,10 @@ struct FifoDeviceHandle { ProxyTrigger* triggerPtr = &(this->triggers[curFifoHead % size]); - // store with memory order release so that the while loop does not go pass this. + // There is a write-after-read hazard on triggerPtr->fst, so the st instruction will not be executed + // before the loop. #if defined(MSCCLPP_DEVICE_CUDA) - asm volatile("st.global.release.sys.v2.u64 [%0], {%1,%2};" ::"l"(triggerPtr), "l"(trigger.fst), "l"(trigger.snd)); + asm volatile("st.global.relaxed.sys.v2.u64 [%0], {%1,%2};" ::"l"(triggerPtr), "l"(trigger.fst), "l"(trigger.snd)); #else // !defined(MSCCLPP_DEVICE_CUDA) // TODO: both atomic and clang built-ins are buggy here triggerPtr->fst = trigger.fst;