I have parallelized a nested loop using OpenMP and the parallel version is much slower. Is there a way to do it faster? This problem seems to be caused by the use of rand() functions. When they are removed, the parallel version works faster.
When removing the lines including rand() functions, the parallelization works as expected.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#define NTHREADS 28
#define tolerancia 0.0000000000001
#define _pi 3.14159
/*******************************************************/
/********************* Main Program ********************/
/*******************************************************/
int main(){
// Definition of read variables
double DX, DT, x1, x2;;
int NCELL, i, j, k, sourceTerm, Nc, n_source;
double amp_source, a_source, b_source, suma_source, L_source, x1_source, x2_source;
double *xcell,*source;
omp_set_num_threads(NTHREADS);
// Inicialization
NCELL = 8000;
DX = 0.0001;
sourceTerm = 1;
DT = 0.5*DX/1.0;
x1 = 0.0;
x2 = 0.0;
xcell = (double*)malloc(NCELL * sizeof(xcell));
source = (double*)malloc(NCELL * sizeof(xcell));
for(i=0;i<NCELL;i++){
xcell[i] = -1 + DX/2*(2*i+1);
}
srand(1234);
Nc = 40;
amp_source = 0.04;
L_source = 2.0;
a_source = 0.0;
b_source = 0.0;
x1_source = 0.0;
x2_source = 0.0;
suma_source = 0.0;
for (j=1;j<100;j++){
#pragma omp parallel for default(none) private(x1,x1_source,x2,x2_source,suma_source,n_source,a_source,b_source) shared(xcell,DX,DT,amp_source,source,Nc,NCELL,L_source)
for(i=0;i<NCELL;i++){
x1 = xcell[i] + DX/2.0;
x1_source = 2*_pi*x1/L_source;
x2 = xcell[i] - DX/2.0;
x2_source = 2*_pi*x2/L_source;
suma_source = 0.0;
for(n_source=1;n_source<=Nc;n_source++){
a_source = rand()*(1.0/RAND_MAX); //if this and the following line are commented, the parallel version works as expected
b_source = rand()*(1.0/RAND_MAX);
suma_source = suma_source + pow(-2.0*log(a_source + tolerancia) + 3.0*tolerancia,0.5)*cos(2.0*_pi*b_source)/pow(_pi*n_source,1.5)*(sin(n_source*x1_source) - sin(n_source*x2_source));
}
source[i] = amp_source/pow(DT,0.5)*suma_source*L_source;
}
}
}
The code above can be saved as example.c and compiled as: gcc -fopenmp example.c -lm -o example
I would be looking forward to knowing if it is possible to accelerate this piece of code by replacing those rand() functions or making an alternative parallelization.
Aucun commentaire:
Enregistrer un commentaire