Skip to content

Instantly share code, notes, and snippets.

@jakerieger
Created January 11, 2024 05:50
Show Gist options
  • Save jakerieger/f278e41182ab57b295c7c67cc01995c8 to your computer and use it in GitHub Desktop.
Save jakerieger/f278e41182ab57b295c7c67cc01995c8 to your computer and use it in GitHub Desktop.
Simple SIMD vs Scalar Benchmark in C
#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>
#define ARRAY_SIZE 8
// Reads the processor's time-stamp counter with the x86 RDTSC instruction.
// The instruction delivers the counter as two 32-bit halves (high half in
// EDX, low half in EAX); they are merged into one 64-bit value that the
// benchmark uses as a cycle-granular clock.
uint64_t rdtsc() {
    unsigned int low_bits;
    unsigned int high_bits;
    __asm __volatile ("rdtsc" : "=a" (low_bits), "=d" (high_bits));
    const uint64_t upper = (uint64_t)high_bits << 32;
    return upper | low_bits;
}
// Scalar reference kernel: for each of the ARRAY_SIZE elements, stores
// a[i] * b[i] into d[i] and then adds c[i] to it in a second step.
// d must have room for ARRAY_SIZE floats.
void multiply_and_add_scalar(const float* a, const float* b, const float* c, float* d) {
    int idx = 0;
    while (idx < ARRAY_SIZE) {
        // Product is written to the destination first, then accumulated in
        // place, exactly as two separate statements.
        d[idx] = a[idx] * b[idx];
        d[idx] += c[idx];
        ++idx;
    }
}
// Vector counterpart of the scalar kernel: hands the three 8-float vectors
// to the _mm256_fmadd_ps intrinsic and returns its result (a * b + c per
// lane).
__m256 multiply_and_add_avx2(__m256 a, __m256 b, __m256 c) {
    const __m256 fused = _mm256_fmadd_ps(a, b, c);
    return fused;
}
/*
 * Benchmark driver.
 *
 * Runs the scalar and the AVX2 multiply-add once each on the same
 * eight-element inputs, brackets each call with rdtsc(), and prints the
 * first result element, the raw cycle counts, and their ratio.
 *
 * argc/argv are accepted but not used. Returns 0.
 *
 * NOTE(review): each kernel is timed over a single call on compile-time
 * constant data, so the reported numbers are only a rough indication.
 */
int main(int argc, char* argv[]) {
    (void)argc;
    (void)argv;

    printf("== SIMD Sandbox ==\n");
    printf("\n");

    const float listA[ARRAY_SIZE] = {1.5662443198f, 1.9499608073f, 1.3247428207f, 1.8505472433f, 1.7390682659f, 1.0293262266f, 1.0932545285f, 1.6318153858f};
    const float listB[ARRAY_SIZE] = {1.4796034305f, 1.5185142064f, 1.4468925043f, 1.8052322854f, 1.3168688321f, 1.2408157169f, 1.9994789087f, 1.2295182167f};
    const float listC[ARRAY_SIZE] = {1.3171742351f, 1.0768740252f, 1.0896275207f, 1.5868856641f, 1.0973056825f, 1.1825299576f, 1.6868755141f, 1.0101495488f};

    // The scalar kernel writes ARRAY_SIZE floats, so the destination has to
    // be a full array; a single float would be overrun on the stack.
    float result[ARRAY_SIZE];

    uint64_t start = rdtsc();
    multiply_and_add_scalar(listA, listB, listC, result);
    uint64_t end = rdtsc();
    const uint64_t scalarCycleCount = end - start;

    printf("Scalar Result: %f\n", result[0]);
    // uint64_t is not guaranteed to be unsigned long; cast to a type with a
    // fixed printf conversion instead of relying on %lu.
    printf("CPU Cycle Count: %llu\n", (unsigned long long)scalarCycleCount);

    // Load the vectors from the same arrays the scalar path used, so the
    // inputs exist in one place and lane i corresponds to array index i.
    // A __m256 holds eight floats, which matches ARRAY_SIZE as defined in
    // this file.
    const __m256 vListA = _mm256_loadu_ps(listA);
    const __m256 vListB = _mm256_loadu_ps(listB);
    const __m256 vListC = _mm256_loadu_ps(listC);

    start = rdtsc();
    const __m256 vResult = multiply_and_add_avx2(vListA, vListB, vListC);
    end = rdtsc();
    const uint64_t avx2CycleCount = end - start;

    // Copy the lanes out with a store rather than reading the vector through
    // a float pointer cast.
    float fResult[ARRAY_SIZE];
    _mm256_storeu_ps(fResult, vResult);

    printf("\n");
    // Element 0 here is the same input element the scalar path reports.
    printf("AVX2 Result: %f\n", fResult[0]);
    printf("CPU Cycle Count: %llu\n", (unsigned long long)avx2CycleCount);
    printf("\n");

    // Divide in floating point so the ratio is not truncated to an integer,
    // and skip the division entirely when the measured AVX2 count is zero.
    if (avx2CycleCount != 0) {
        printf("Speed Improvement: %0.2fx", (double)scalarCycleCount / (double)avx2CycleCount);
    } else {
        printf("Speed Improvement: n/a");
    }
    printf("\n");
    return 0;
}
@jakerieger
Copy link
Author

Compiled with Clang 14 on Ubuntu 22.04 using the -mfma flag

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment