About thrust API's performance #314

Huntersdeng · 2024-12-30T12:14:24Z

ENV:
cpu: Intel® Core™ i7-14700K
ubuntu 22.04
GPU: NVIDIA GeForce RTX 4070 SUPER
CUDA 12.3

I'm currently learning to use thrust API. However, in my test, the thrust API's performance is quite low. Here's my timing code of 0_Introduction/c++11_cuda.cu

#include <cuda_profiler_api.h>
#include <helper_cuda.h>
#include <thrust/count.h>
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>

#include <iostream>

/////////////////////////////////////////////////////////////////
// Some utility code to define grid_stride_range
// Normally this would be in a header but it's here
// for didactic purposes. Uses
#include "range.hpp"
using namespace util::lang;

// type alias to simplify typing...
template <typename T>
using step_range = typename range_proxy<T>::step_range_proxy;

template <typename T>
__device__ step_range<T> grid_stride_range(T begin, T end) {
  begin += blockDim.x * blockIdx.x + threadIdx.x;
  return range(begin, end).step(gridDim.x * blockDim.x);
}
/////////////////////////////////////////////////////////////////

template <typename T, typename Predicate>
__device__ void count_if(int *count, T *data, int n, Predicate p) {
  for (auto i : grid_stride_range(0, n)) {
    if (p(data[i])) atomicAdd(count, 1);
  }
}

// Use count_if with a lambda function that searches for x, y, z or w
// Note the use of range-based for loop and initializer_list inside the functor
// We use auto so we don't have to know the type of the functor or array
__global__ void xyzw_frequency(int *count, char *text, int n) {
  const char letters[]{'x', 'y', 'z', 'w'};

  count_if(count, text, n, [&](char c) {
    for (const auto x : letters)
      if (c == x) return true;
    return false;
  });
}

__global__ void xyzw_frequency_thrust_device(int *count, char *text, int n) {
  const char letters[]{'x', 'y', 'z', 'w'};
  *count = thrust::count_if(thrust::device, text, text + n, [=](char c) {
    for (const auto x : letters)
      if (c == x) return true;
    return false;
  });
}

// a bug in Thrust 1.8 causes warnings when this is uncommented
// so commented out by default -- fixed in Thrust master branch
#if 0 
void xyzw_frequency_thrust_host(int *count, char *text, int n)
{
  const char letters[] { 'x','y','z','w' };
  *count = thrust::count_if(thrust::host, text, text+n, [&](char c) {
    for (const auto x : letters) 
      if (c == x) return true;
    return false;
  });
}
#endif

int main(int argc, char **argv) {
  const char *filename = sdkFindFilePath("warandpeace.txt", argv[0]);

  int numBytes = 16 * 1048576;
  char *h_text = (char *)malloc(numBytes);

  // find first CUDA device
  int devID = findCudaDevice(argc, (const char **)argv);

  char *d_text;
  checkCudaErrors(cudaMalloc((void **)&d_text, numBytes));

  FILE *fp = fopen(filename, "r");
  if (fp == NULL) {
    printf("Cannot find the input text file\n. Exiting..\n");
    return EXIT_FAILURE;
  }
  int len = (int)fread(h_text, sizeof(char), numBytes, fp);
  fclose(fp);
  std::cout << "Read " << len << " byte corpus from " << filename << std::endl;

  checkCudaErrors(cudaMemcpy(d_text, h_text, len, cudaMemcpyHostToDevice));

  int count = 0;
  int *d_count;
  checkCudaErrors(cudaMalloc(&d_count, sizeof(int)));
  checkCudaErrors(cudaMemset(d_count, 0, sizeof(int)));

  cudaEvent_t start, stop;
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));

  checkCudaErrors(cudaDeviceSynchronize());

  // Try uncommenting one kernel call at a time
  float gpu_time = 0.0f;
  cudaEventRecord(start, 0);
  xyzw_frequency<<<8, 256>>>(d_count, d_text, len);
  checkCudaErrors(cudaDeviceSynchronize());
  cudaEventRecord(stop, 0);
  checkCudaErrors(cudaEventSynchronize(start));
  checkCudaErrors(cudaEventSynchronize(stop));
  checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
  std::cout << "xyzw_frequency kernel time: " << gpu_time << " ms" << std::endl;
  checkCudaErrors(
      cudaMemcpy(&count, d_count, sizeof(int), cudaMemcpyDeviceToHost));
  std::cout << "counted " << count
            << " instances of 'x', 'y', 'z', or 'w' in \"" << filename << "\""
            << std::endl;

  checkCudaErrors(cudaDeviceSynchronize());
  float thrust_time = 0.0f;
  cudaEventRecord(start, 0);
  xyzw_frequency_thrust_device<<<1, 1>>>(d_count, d_text, len);
  checkCudaErrors(cudaDeviceSynchronize());
  cudaEventRecord(stop, 0);
  checkCudaErrors(cudaEventSynchronize(start));
  checkCudaErrors(cudaEventSynchronize(stop));
  checkCudaErrors(cudaEventElapsedTime(&thrust_time, start, stop));
  std::cout << "xyzw_frequency_thrust_device kernel time: " << thrust_time
            << " ms" << std::endl;

  checkCudaErrors(
      cudaMemcpy(&count, d_count, sizeof(int), cudaMemcpyDeviceToHost));

  // xyzw_frequency_thrust_host(&count, h_text, len);

  std::cout << "counted " << count
            << " instances of 'x', 'y', 'z', or 'w' in \"" << filename << "\""
            << std::endl;

  checkCudaErrors(cudaFree(d_count));
  checkCudaErrors(cudaFree(d_text));

  return EXIT_SUCCESS;
}

Are there any mistakes in my timing code?

The text was updated successfully, but these errors were encountered:

Huntersdeng · 2024-12-31T02:08:30Z

Here's my test result.

GPU Device 0: "Ada" with compute capability 8.9

Read 3223503 byte corpus from ../Samples/0_Introduction/c++11_cuda/warandpeace.txt
xyzw_frequency kernel time: 0.809984 ms
counted 107310 instances of 'x', 'y', 'z', or 'w' in "../Samples/0_Introduction/c++11_cuda/warandpeace.txt"
xyzw_frequency_thrust_device kernel time: 57.5926 ms
counted 107310 instances of 'x', 'y', 'z', or 'w' in "../Samples/0_Introduction/c++11_cuda/warandpeace.txt"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

About thrust API's performance #314

About thrust API's performance #314

Huntersdeng commented Dec 30, 2024

Huntersdeng commented Dec 31, 2024

About thrust API's performance #314

About thrust API's performance #314

Comments

Huntersdeng commented Dec 30, 2024

Huntersdeng commented Dec 31, 2024