I have the following very simple code to test the performance of the standard cuRAND library functions. There are two kernels in the code: "init_rand", which initializes the random number generator for each thread, and "generate_rand", which generates random numbers on each thread. The GPU I have is an NVIDIA Tesla P100-PCIE-12GB. In the code, each thread generates 100*1024 random numbers (but does not write them into any array). By varying the number of blocks and threads per block, I measured the following timings.
- 112 blocks, 1024 threads per block, time 22ms
- 224 blocks, 512 threads per block, time 22ms
- 448 blocks, 256 threads per block, time 22ms
- 112 blocks, 512 threads per block, time 11ms
I think the first three results are reasonable, as the total number of threads used is the same. However, I don't see why the time in the fourth row is reduced by half. The workload for each thread remains the same in all four tests (each thread generates 1024 * 100 numbers). So why is the time reduced when fewer threads are used? Does this have something to do with memory?
#include<stdio.h>
#include"RNG.h"
/* Number of thread blocks in the grid; used for both kernel launches in main. */
#define BLOCK 56
/* Number of threads per block; used for both kernel launches in main. */
#define THREAD 1024
/* Value passed as generate_rand's size argument: how many random numbers
 * each thread draws (100 * 1024). */
#define SIZE (100 * 1024)
/*
 * Evaluate a CUDA runtime call; on failure print the error string with the
 * source location to stderr and leave main with a non-zero exit status.
 * Only usable inside main because it expands to a `return`. Device memory
 * and events still alive at that point are reclaimed when the process exits.
 */
#define RNG_BENCH_CUDA_CHECK(call)                                          \
    do {                                                                    \
        cudaError_t rng_bench_err_ = (call);                                \
        if (rng_bench_err_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(rng_bench_err_));          \
            return 1;                                                       \
        }                                                                   \
    } while (0)

/*
 * Benchmark driver: allocates one curandState_t per launched thread, seeds
 * the states with init_rand, then times a single generate_rand launch with
 * CUDA events and prints the elapsed time.
 *
 * Returns 0 on success, 1 if any CUDA runtime call or kernel launch fails.
 */
int main()
{
    /* SET UP RNG: one generator state for each of the THREAD * BLOCK threads. */
    curandState_t *streams = NULL;
    RNG_BENCH_CUDA_CHECK(
        cudaMalloc((void**)&streams, sizeof(curandState_t) * THREAD * BLOCK));

    init_rand<<<BLOCK, THREAD>>>(time(0), streams);
    /* Launch-configuration errors are only visible through cudaGetLastError. */
    RNG_BENCH_CUDA_CHECK(cudaGetLastError());
    /* Wait for seeding to finish so that a failure inside init_rand is
     * reported here instead of silently corrupting the measurement below. */
    RNG_BENCH_CUDA_CHECK(cudaDeviceSynchronize());

    /* Performance measure: bracket the kernel with two events on stream 0. */
    cudaEvent_t start, stop;
    RNG_BENCH_CUDA_CHECK(cudaEventCreate(&start));
    RNG_BENCH_CUDA_CHECK(cudaEventCreate(&stop));
    RNG_BENCH_CUDA_CHECK(cudaEventRecord(start, 0));

    /* The kernel being timed. */
    generate_rand<<<BLOCK, THREAD>>>(streams, SIZE);
    RNG_BENCH_CUDA_CHECK(cudaGetLastError());

    RNG_BENCH_CUDA_CHECK(cudaEventRecord(stop, 0));
    /* Blocks until the stop event (and therefore the kernel) has completed;
     * execution errors from generate_rand surface here. */
    RNG_BENCH_CUDA_CHECK(cudaEventSynchronize(stop));

    float elapsed_time = 0.0f;
    RNG_BENCH_CUDA_CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
    /* cudaEventElapsedTime reports milliseconds, so label the value as such. */
    printf("Time to generate: %3.1f milliseconds\n", elapsed_time);

    RNG_BENCH_CUDA_CHECK(cudaEventDestroy(start));
    RNG_BENCH_CUDA_CHECK(cudaEventDestroy(stop));
    RNG_BENCH_CUDA_CHECK(cudaFree(streams));
    return 0;
}
#include"RNG.h"
/*
 * Kernel: initialise one cuRAND generator state per thread.
 *
 * Each thread computes its flat 1D global index and calls curand_init on the
 * state at that index, passing `seed`, the index itself as the second
 * argument, and 0 as the third.
 *
 * seed    - seed value forwarded unchanged to curand_init by every thread.
 * streams - array of generator states, indexed by flat thread index.
 *
 * There is no bounds guard, so `streams` must hold at least
 * gridDim.x * blockDim.x entries for the launch configuration used.
 */
__global__ void init_rand(unsigned int seed, curandState_t* streams)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curandState_t* const slot = streams + tid;
    curand_init(seed, tid, 0, slot);
}
/*
 * Kernel: advance each thread's cuRAND generator by `size` uniform draws.
 *
 * The generated values are discarded; only the updated generator state is
 * written back to streams[id], so the state left behind is the same as
 * after `size` successive gen_single_rand(streams, id) calls.
 *
 * streams - array of generator states, indexed by flat 1D thread index.
 *           There is no bounds guard, so it must hold at least
 *           gridDim.x * blockDim.x entries for the launch configuration used.
 * size    - number of draws per thread; values <= 0 leave the state untouched.
 */
__global__ void generate_rand(curandState_t *streams, long size)
{
    const int id = threadIdx.x + blockIdx.x * blockDim.x;

    if (size <= 0) {
        return;
    }

    /* Copy the state out of the streams array once, instead of once per
     * draw, so the loop works on a thread-private copy. */
    curandState_t local_state = streams[id];

    /* The counter has the same type as `size`, so it cannot overflow before
     * reaching the bound (an int counter could when size > INT_MAX). */
    for (long i = 0; i < size; ++i) {
        /*generate random numbers*/
        (void)curand_uniform(&local_state);
    }

    /* Persist the advanced state so a later launch continues the sequence. */
    streams[id] = local_state;
}
/*
 * Device helper: draw one uniform random number from generator `id`.
 *
 * Copies streams[id] into a local variable, calls curand_uniform on that
 * copy, then writes the advanced state back to streams[id].
 *
 * streams - array of generator states.
 * id      - index of the state to use; not range-checked.
 *
 * Returns the value produced by curand_uniform, converted to double.
 */
__device__ double gen_single_rand(curandState_t *streams, int id)
{
    curandState_t *slot = streams + id;

    /* Work on a local copy of the state, then store the result back. */
    curandState_t scratch = *slot;
    const double sample = curand_uniform(&scratch);
    *slot = scratch;

    return sample;
}
Aucun commentaire:
Enregistrer un commentaire