zeux · June 9, 2023 07:38
diff --git a/alphasort.cpp b/alphasort.cpp
 /*
 The Nature paper about sorting algorithms has an "improvement" for sort3 that saves a mov.
 Thread for context: https://mastodon.gamedev.place/@zeux/110510029570470184
 
 This code is experimentally verifying that the proposed optimization is perf neutral
 (aka is not improving performance). You'll need to remove the mov from all 3 versions
 and retest; feel free to test one version at a time.
 
 Cycle count established by using 'perf stat' on Ryzen 7 5900X - it does not depend on
 whether the mov is there.

 Register layout for all three versions (using volatile registers to avoid save/restore):
   P %eax
   Q %esi
   R %ecx
   S %edx

 Note: all routines are called "sort3" but these are implementing the last 2 stages of a
 3-stage 3-element sorting network. This matches "figure 3" from the paper.
 */

 __attribute__((noinline))
 __attribute__((naked))
 void sort3(int* data)
 {
 	__asm__(
 "movl 0(%rdi), %eax\n"
 "movl 4(%rdi), %esi\n"
 "movl 8(%rdi), %ecx\n"

 "movl %ecx, %edx\n"
 "cmpl %ecx, %eax\n"

 "cmovg %eax, %ecx\n"
 "cmovl %eax, %edx\n"

 "movl %edx, %eax\n" // remove this line to discover the secret

 "cmpl %esi, %edx\n"
 "cmovg %esi, %eax\n"
 "cmovg %edx, %esi\n"

 "movl %eax, 0(%rdi)\n"
 "movl %esi, 4(%rdi)\n"
 "movl %ecx, 8(%rdi)\n"

 "ret\n"
 );
 }

 __attribute__((noinline))
 __attribute__((naked))
 void sort3loop(int* data, int count)
 {
 	__asm__(
 "movl %esi, %r8d\n"

 ".loop:\n"
 "movl 0(%rdi), %eax\n"
 "movl 4(%rdi), %esi\n"
 "movl 8(%rdi), %ecx\n"

 "movl %ecx, %edx\n"
 "cmpl %ecx, %eax\n"

 "cmovg %eax, %ecx\n"
 "cmovl %eax, %edx\n"

 "movl %edx, %eax\n" // remove this line to discover the secret

 "cmpl %esi, %edx\n"
 "cmovg %esi, %eax\n"
 "cmovg %edx, %esi\n"

 "movl %eax, 0(%rdi)\n"
 "movl %esi, 4(%rdi)\n"
 "movl %ecx, 8(%rdi)\n"

 "dec %r8d\n"
 "jnz .loop\n"

 "ret\n"
 );
 }

 __attribute__((noinline))
 __attribute__((naked))
 void sort3loopreg(int* data, int count)
 {
 	__asm__(
 "movl %esi, %r8d\n"

 ".loopreg:\n"

 "movl 0(%rdi), %eax\n"
 "movl 4(%rdi), %esi\n"
 "movl 8(%rdi), %ecx\n"

 "movl %ecx, %edx\n"
 "cmpl %ecx, %eax\n"

 "cmovg %eax, %ecx\n"
 "cmovl %eax, %edx\n"

 "movl %edx, %eax\n" // remove this line to discover the secret

 "cmpl %esi, %edx\n"
 "cmovg %esi, %eax\n"
 "cmovg %edx, %esi\n"

 "dec %r8d\n"
 "jnz .loopreg\n"

 "movl %eax, 0(%rdi)\n"
 "movl %esi, 4(%rdi)\n"
 "movl %ecx, 8(%rdi)\n"

 "ret\n"
 );
 }

 int main()
 {
 	int data[3] = { 3, 1, 2 };

 	// 6 cycles per iteration (including call overhead)
 	for (int i = 0; i < 1'000'000'000; ++i) sort3(data);

 	// 4 cycles per iteration (load/store memory)
 	sort3loop(data, 1'000'000'000);

 	// 3 cycles per iteration (sort registers to avoid inter-iteration dependencies)
 	sort3loopreg(data, 1'000'000'000);

 	return data[0] == 1 && data[1] == 2 && data[2] == 3 ? 0 : 1;
 }
	/*
	The Nature paper about sorting algorithms has an "improvement" for sort3 that saves a mov.
	Thread for context: https://mastodon.gamedev.place/@zeux/110510029570470184

	This code is experimentally verifying that the proposed optimization is perf neutral
	(aka is not improving performance). You'll need to remove the mov from all 3 versions
	and retest; feel free to test one version at a time.

	Cycle count established by using 'perf stat' on Ryzen 7 5900X - it does not depend on
	whether the mov is there.

	Register layout for all three versions (using volatile registers to avoid save/restore):
	P %eax
	Q %esi
	R %ecx
	S %edx

	Note: all routines are called "sort3" but these are implementing the last 2 stages of a
	3-stage 3-element sorting network. This matches "figure 3" from the paper.
	*/

	__attribute__((noinline))
	__attribute__((naked))
	void sort3(int* data)
	{
	__asm__(
	"movl 0(%rdi), %eax\n"
	"movl 4(%rdi), %esi\n"
	"movl 8(%rdi), %ecx\n"

	"movl %ecx, %edx\n"
	"cmpl %ecx, %eax\n"

	"cmovg %eax, %ecx\n"
	"cmovl %eax, %edx\n"

	"movl %edx, %eax\n" // remove this line to discover the secret

	"cmpl %esi, %edx\n"
	"cmovg %esi, %eax\n"
	"cmovg %edx, %esi\n"

	"movl %eax, 0(%rdi)\n"
	"movl %esi, 4(%rdi)\n"
	"movl %ecx, 8(%rdi)\n"

	"ret\n"
	);
	}

	__attribute__((noinline))
	__attribute__((naked))
	void sort3loop(int* data, int count)
	{
	__asm__(
	"movl %esi, %r8d\n"

	".loop:\n"
	"movl 0(%rdi), %eax\n"
	"movl 4(%rdi), %esi\n"
	"movl 8(%rdi), %ecx\n"

	"movl %ecx, %edx\n"
	"cmpl %ecx, %eax\n"

	"cmovg %eax, %ecx\n"
	"cmovl %eax, %edx\n"

	"movl %edx, %eax\n" // remove this line to discover the secret

	"cmpl %esi, %edx\n"
	"cmovg %esi, %eax\n"
	"cmovg %edx, %esi\n"

	"movl %eax, 0(%rdi)\n"
	"movl %esi, 4(%rdi)\n"
	"movl %ecx, 8(%rdi)\n"

	"dec %r8d\n"
	"jnz .loop\n"

	"ret\n"
	);
	}

	__attribute__((noinline))
	__attribute__((naked))
	void sort3loopreg(int* data, int count)
	{
	__asm__(
	"movl %esi, %r8d\n"

	".loopreg:\n"

	"movl 0(%rdi), %eax\n"
	"movl 4(%rdi), %esi\n"
	"movl 8(%rdi), %ecx\n"

	"movl %ecx, %edx\n"
	"cmpl %ecx, %eax\n"

	"cmovg %eax, %ecx\n"
	"cmovl %eax, %edx\n"

	"movl %edx, %eax\n" // remove this line to discover the secret

	"cmpl %esi, %edx\n"
	"cmovg %esi, %eax\n"
	"cmovg %edx, %esi\n"

	"dec %r8d\n"
	"jnz .loopreg\n"

	"movl %eax, 0(%rdi)\n"
	"movl %esi, 4(%rdi)\n"
	"movl %ecx, 8(%rdi)\n"

	"ret\n"
	);
	}

	int main()
	{
	int data[3] = { 3, 1, 2 };

	// 6 cycles per iteration (including call overhead)
	for (int i = 0; i < 1'000'000'000; ++i) sort3(data);

	// 4 cycles per iteration (load/store memory)
	sort3loop(data, 1'000'000'000);

	// 3 cycles per iteration (sort registers to avoid inter-iteration dependencies)
	sort3loopreg(data, 1'000'000'000);

	return data[0] == 1 && data[1] == 2 && data[2] == 3 ? 0 : 1;
	}