I am trying to get decent performance out of MSVC for this code. My naive benchmark shows that the clang-built executable runs in about 10% of the time taken by the MSVC build. GCC falls between the two, typically at about 25% of the MSVC run time. Is it possible to coax MSVC into producing better assembly? I've looked at the output on Compiler Explorer, but my experiments haven't made much difference. For background, this is the core block of the Philox4x32 random number generator.
#include <inttypes.h>
#include <time.h>
#include <stdio.h>
/* Two 32-bit words; passed to round4x32 as the key. */
struct array2x32 {
    uint32_t v[2];
};

/* Four 32-bit words; used as both the counter input and the round output. */
struct array4x32 {
    uint32_t v[4];
};

/*
 * Apply one multiply-and-xor mixing round to a 4x32-bit counter.
 *
 * ctr.v[0] and ctr.v[2] are each multiplied by a fixed 32-bit constant
 * using a full 64-bit product. The low halves are passed through as-is;
 * the high halves are xored with the remaining counter words and the key:
 *
 *   hi0 = high32(0xD2511F53 * ctr.v[0]) ^ ctr.v[3] ^ key.v[1]
 *   hi1 = high32(0xCD9E8D57 * ctr.v[2]) ^ ctr.v[1] ^ key.v[0]
 *
 * Returns the four words in the order { hi1, lo1, hi0, lo0 }.
 *
 * NOTE(review): the initializer of `out` was missing from this file. The
 * order above is the one used by the published Philox4x32 round; confirm
 * against the original source.
 */
struct array4x32 round4x32(struct array4x32 ctr, struct array2x32 key) {
    uint64_t product;

    /* Widen before multiplying so the upper 32 bits are not discarded. */
    product = 0x00000000D2511F53ULL * (uint64_t)ctr.v[0];
    uint32_t lo0 = (uint32_t)product;
    uint32_t hi0 = ((uint32_t)(product >> 32)) ^ ctr.v[3] ^ key.v[1];

    product = 0x00000000CD9E8D57ULL * (uint64_t)ctr.v[2];
    uint32_t lo1 = (uint32_t)product;
    uint32_t hi1 = ((uint32_t)(product >> 32)) ^ ctr.v[1] ^ key.v[0];

    struct array4x32 out = {{hi1, lo1, hi0, lo0}};
    return out;
}
#define N 1000000000
int main(){
struct array4x32 ctr = 0;
struct array2x32 key = 0;
struct array4x32 out;
uint64_t count = 0, sum = 0;
int i, j;
clock_t begin = clock();
for (i = 0; i < N / 4UL; i++) {
ctr.v[0]++;
out = round4x32(ctr, key);
for (j = 0; j < 4; j++) {
sum += out.v[j];
count++;
}
}
clock_t end = clock();
double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
printf("%0.10f", time_spent);
printf("0x%" PRIx64 "\ncount: %" PRIu64 "\n", sum, count);
printf("%" PRIu64 " randoms per second\n",
(uint64_t)((N / time_spent) / 1000000 * 1000000));
}
Aucun commentaire:
Enregistrer un commentaire