Last active
June 9, 2023 07:38
-
-
Save zeux/37f8f9dd7840ac0217ee862ba3a269fd to your computer and use it in GitHub Desktop.
On Nature paper about sorting algorithms. Thread for context: https://mastodon.gamedev.place/@zeux/110510029570470184.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
The Nature paper about sorting algorithms has an "improvement" for sort3 that saves a mov. | |
Thread for context: https://mastodon.gamedev.place/@zeux/110510029570470184 | |
This code is experimentally verifying that the proposed optimization is perf neutral | |
(aka is not improving performance). You'll need to remove the mov from all 3 versions | |
and retest; feel free to test one version at a time. | |
Cycle count established by using 'perf stat' on Ryzen 7 5900X - it does not depend on | |
whether the mov is there. | |
Register layout for all three versions (using volatile registers to avoid save/restore): | |
P %eax | |
Q %esi | |
R %ecx | |
S %edx | |
Note: all routines are called "sort3" but these are implementing the last 2 stages of a | |
3-stage 3-element sorting network. This matches "figure 3" from the paper. | |
*/ | |
__attribute__((noinline)) | |
__attribute__((naked)) | |
void sort3(int* data) | |
{ | |
__asm__( | |
"movl 0(%rdi), %eax\n" | |
"movl 4(%rdi), %esi\n" | |
"movl 8(%rdi), %ecx\n" | |
"movl %ecx, %edx\n" | |
"cmpl %ecx, %eax\n" | |
"cmovg %eax, %ecx\n" | |
"cmovl %eax, %edx\n" | |
"movl %edx, %eax\n" // remove this line to discover the secret | |
"cmpl %esi, %edx\n" | |
"cmovg %esi, %eax\n" | |
"cmovg %edx, %esi\n" | |
"movl %eax, 0(%rdi)\n" | |
"movl %esi, 4(%rdi)\n" | |
"movl %ecx, 8(%rdi)\n" | |
"ret\n" | |
); | |
} | |
__attribute__((noinline)) | |
__attribute__((naked)) | |
void sort3loop(int* data, int count) | |
{ | |
__asm__( | |
"movl %esi, %r8d\n" | |
".loop:\n" | |
"movl 0(%rdi), %eax\n" | |
"movl 4(%rdi), %esi\n" | |
"movl 8(%rdi), %ecx\n" | |
"movl %ecx, %edx\n" | |
"cmpl %ecx, %eax\n" | |
"cmovg %eax, %ecx\n" | |
"cmovl %eax, %edx\n" | |
"movl %edx, %eax\n" // remove this line to discover the secret | |
"cmpl %esi, %edx\n" | |
"cmovg %esi, %eax\n" | |
"cmovg %edx, %esi\n" | |
"movl %eax, 0(%rdi)\n" | |
"movl %esi, 4(%rdi)\n" | |
"movl %ecx, 8(%rdi)\n" | |
"dec %r8d\n" | |
"jnz .loop\n" | |
"ret\n" | |
); | |
} | |
__attribute__((noinline)) | |
__attribute__((naked)) | |
void sort3loopreg(int* data, int count) | |
{ | |
__asm__( | |
"movl %esi, %r8d\n" | |
".loopreg:\n" | |
"movl 0(%rdi), %eax\n" | |
"movl 4(%rdi), %esi\n" | |
"movl 8(%rdi), %ecx\n" | |
"movl %ecx, %edx\n" | |
"cmpl %ecx, %eax\n" | |
"cmovg %eax, %ecx\n" | |
"cmovl %eax, %edx\n" | |
"movl %edx, %eax\n" // remove this line to discover the secret | |
"cmpl %esi, %edx\n" | |
"cmovg %esi, %eax\n" | |
"cmovg %edx, %esi\n" | |
"dec %r8d\n" | |
"jnz .loopreg\n" | |
"movl %eax, 0(%rdi)\n" | |
"movl %esi, 4(%rdi)\n" | |
"movl %ecx, 8(%rdi)\n" | |
"ret\n" | |
); | |
} | |
int main() | |
{ | |
int data[3] = { 3, 1, 2 }; | |
// 6 cycles per iteration (including call overhead) | |
for (int i = 0; i < 1'000'000'000; ++i) sort3(data); | |
// 4 cycles per iteration (load/store memory) | |
sort3loop(data, 1'000'000'000); | |
// 3 cycles per iteration (sort registers to avoid inter-iteration dependencies) | |
sort3loopreg(data, 1'000'000'000); | |
return data[0] == 1 && data[1] == 2 && data[2] == 3 ? 0 : 1; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment