mardi 30 janvier 2018

how to interpret cuda performance

I have the following very simple code to test the performance of the standard cuRAND library functions. There are two kernels in the code: "init_rand", which initializes a random number generator for each thread, and "generate_rand", which generates random numbers on each thread. The GPU I have is an NVIDIA Tesla P100-PCIE-12GB. In the code, each thread generates 100*1024 random numbers (but does not write them into any array). By varying the number of blocks and the number of threads per block, I measured the following timings.

  • 112 blocks, 1024 threads per block, time 22ms
  • 224 blocks, 512 threads per block, time 22ms
  • 448 blocks, 256 threads per block, time 22ms
  • 112 blocks, 512 threads per block, time 11ms

I think the first three results are reasonable, as the total number of threads used is the same. However, I don't see why the time in the fourth row is reduced by half. The workload for each thread remains the same in all four tests (each thread generates 1024 * 100 numbers). So why is the time reduced when fewer threads are used? Does this have something to do with the memory?


#include<stdio.h>
#include"RNG.h"

#define BLOCK 56
#define THREAD 1024
#define SIZE (100 * 1024)

/* Report a failed CUDA runtime call on stderr.
 * Returns 1 when `err` is an error, 0 when it is cudaSuccess. */
static int cuda_failed(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return 0;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
    return 1;
}

/* Launch generate_rand<<<BLOCK, THREAD>>> between two events on stream 0 and
 * store the elapsed time, in milliseconds, in *elapsed_ms.
 * Returns 0 on success, 1 if any CUDA call or the kernel itself failed. */
static int time_generate(curandState_t *streams, float *elapsed_ms)
{
    cudaEvent_t start, stop;
    if (cuda_failed(cudaEventCreate(&start), "cudaEventCreate(start)"))
        return 1;
    if (cuda_failed(cudaEventCreate(&stop), "cudaEventCreate(stop)")) {
        cudaEventDestroy(start);
        return 1;
    }

    int failed = cuda_failed(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    if (!failed) {
        /*The kernel*/
        generate_rand<<<BLOCK, THREAD>>>(streams, SIZE);
        /* A launch does not return a status; a bad configuration only shows up here. */
        failed = cuda_failed(cudaGetLastError(), "generate_rand launch");
    }
    if (!failed)
        failed = cuda_failed(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    if (!failed)
        /* Waiting on `stop` also surfaces faults raised while the kernel ran. */
        failed = cuda_failed(cudaEventSynchronize(stop), "generate_rand execution");
    if (!failed)
        failed = cuda_failed(cudaEventElapsedTime(elapsed_ms, start, stop),
                             "cudaEventElapsedTime");

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return failed;
}

/* Benchmark driver: allocates one curandState_t per launched thread
 * (THREAD * BLOCK of them), seeds them with init_rand, then times a single
 * generate_rand launch and prints the result.
 * Returns 0 on success and 1 if any CUDA call fails. */
int main()
{
    /*SET UP RNG*/
    curandState_t *streams = NULL;
    if (cuda_failed(cudaMalloc((void**)&streams, sizeof(curandState_t) * THREAD * BLOCK),
                    "cudaMalloc"))
        return 1;

    init_rand<<<BLOCK, THREAD>>>((unsigned int)time(0), streams);
    int failed = cuda_failed(cudaGetLastError(), "init_rand launch");
    if (!failed)
        /* Finish (and validate) initialisation before the timed region starts. */
        failed = cuda_failed(cudaDeviceSynchronize(), "init_rand execution");

    /*Performance measure*/
    float elapsed_time = 0.0f;
    if (!failed)
        failed = time_generate(streams, &elapsed_time);
    if (!failed)
        /* cudaEventElapsedTime reports milliseconds, not microseconds. */
        printf("Time to generate: %3.1f ms\n", elapsed_time);

    cudaFree(streams);
    return failed;
}

#include"RNG.h"
/*
 * Seed one cuRAND generator state per launched thread.
 *
 * Each thread computes its flat 1D global index and calls curand_init on
 * streams[index], passing `seed`, that index as the second argument and 0 as
 * the third.
 * There is no bounds check here: the caller must make `streams` large enough
 * for every thread in the launch.
 */
__global__ void init_rand(unsigned int seed, curandState_t* streams)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curandState_t *state = streams + tid;
    curand_init(seed, tid, 0, state);
}

/*
 * Advance each thread's cuRAND generator `size` times, discarding the draws.
 *
 * streams: per-thread generator states, indexed by the flat 1D global thread
 *          index. There is no bounds check: the caller must provide one state
 *          for every thread in the launch.
 * size:    number of curand_uniform draws per thread; values <= 0 draw nothing.
 *
 * The state is copied into a local variable once, advanced there, and written
 * back once at the end, instead of being loaded from and stored to `streams`
 * on every draw. The state left in streams[id] is the same either way.
 */
__global__ void generate_rand(curandState_t *streams, long size)
{
    int id = threadIdx.x + blockIdx.x * blockDim.x;

    curandState_t local_state = streams[id];
    /* The counter is `long` to match `size`; an int counter would overflow
       for sizes above INT_MAX. */
    for (long i = 0; i < size; i++) {
        /*generate random numbers*/
        curand_uniform(&local_state);
    }
    streams[id] = local_state;
}

/*
 * Draw one value from the generator stored at streams[id].
 *
 * The state is copied out of `streams`, advanced by a single curand_uniform
 * call, and copied back, so the next call for the same `id` continues the
 * sequence. The draw is returned widened to double.
 */
__device__ double gen_single_rand(curandState_t *streams, int id)
{
    curandState_t state_copy = streams[id];
    const double draw = curand_uniform(&state_copy);
    streams[id] = state_copy;
    return draw;
}




Aucun commentaire:

Enregistrer un commentaire