Created
January 11, 2024 05:50
-
-
Save jakerieger/f278e41182ab57b295c7c67cc01995c8 to your computer and use it in GitHub Desktop.
Simple SIMD vs Scalar Benchmark in C
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <immintrin.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
// Number of floats in each input/output array used by the benchmark.
// A 256-bit __m256 vector holds exactly this many single-precision lanes.
#define ARRAY_SIZE 8
// Reads the processor's time-stamp counter with the x86 RDTSC instruction.
//
// RDTSC delivers the low 32 bits of the counter in EAX and the high 32 bits
// in EDX; the two halves are combined here into a single 64-bit value.
// Callers take the difference between two reads to estimate how long the
// code in between took.
//
// NOTE(review): RDTSC is not a serializing instruction, so the CPU may
// reorder neighbouring instructions around the read; treat small differences
// as approximate.
//
// Returns: the current 64-bit time-stamp counter value.
uint64_t rdtsc(void) {
    uint32_t lo, hi;
    __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
    return ((uint64_t)hi << 32) | lo;
}
void multiply_and_add_scalar(const float* a, const float* b, const float* c, float* d) { | |
for (int i = 0; i < ARRAY_SIZE; i++) { | |
d[i] = a[i] * b[i]; | |
d[i] = d[i] + c[i]; | |
} | |
} | |
__m256 multiply_and_add_avx2(__m256 a, __m256 b, __m256 c) { | |
return _mm256_fmadd_ps(a, b, c); | |
} | |
int main(int argc, char* argv[]) { | |
printf("== SIMD Sandbox ==\n"); | |
printf("\n"); | |
const float listA[ARRAY_SIZE] = {1.5662443198f, 1.9499608073f, 1.3247428207f, 1.8505472433f, 1.7390682659f, 1.0293262266f, 1.0932545285f, 1.6318153858f}; | |
const float listB[ARRAY_SIZE] = {1.4796034305f, 1.5185142064f, 1.4468925043f, 1.8052322854f, 1.3168688321f, 1.2408157169f, 1.9994789087f, 1.2295182167f}; | |
const float listC[ARRAY_SIZE] = {1.3171742351f, 1.0768740252f, 1.0896275207f, 1.5868856641f, 1.0973056825f, 1.1825299576f, 1.6868755141f, 1.0101495488f}; | |
float result; | |
uint64_t start = rdtsc(); | |
multiply_and_add_scalar(listA, listB, listC, &result); | |
uint64_t end = rdtsc(); | |
const uint64_t scalarCycleCount = end - start; | |
printf("Scalar Result: %f\n", (&result)[0]); | |
printf("CPU Cycle Count: %lu\n", scalarCycleCount); | |
const __m256 vListA = _mm256_set_ps(1.5662443198f, 1.9499608073f, 1.3247428207f, 1.8505472433f, 1.7390682659f, 1.0293262266f, 1.0932545285f, 1.6318153858f); | |
const __m256 vListB = _mm256_set_ps(1.4796034305f, 1.5185142064f, 1.4468925043f, 1.8052322854f, 1.3168688321f, 1.2408157169f, 1.9994789087f, 1.2295182167f); | |
const __m256 vListC = _mm256_set_ps(1.3171742351f, 1.0768740252f, 1.0896275207f, 1.5868856641f, 1.0973056825f, 1.1825299576f, 1.6868755141f, 1.0101495488f); | |
start = rdtsc(); | |
const __m256 vResult = multiply_and_add_avx2(vListA, vListB, vListC); | |
end = rdtsc(); | |
const uint64_t avx2CycleCount = end - start; | |
const float* fResult = (float*)&vResult; | |
printf("\n"); | |
printf("AVX2 Result: %f\n", fResult[7]); | |
printf("CPU Cycle Count: %lu\n", avx2CycleCount); | |
printf("\n"); | |
printf("Speed Improvement: %0.2fx", (scalarCycleCount / avx2CycleCount) * 1.f); | |
printf("\n"); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Compiled with Clang 14 on Ubuntu 22.04 using the `-mfma` flag.