Created
August 3, 2025 01:13
-
-
Save Rexicon226/17b77a0c4121839d7bf98e38f583e4f9 to your computer and use it in GitHub Desktop.
RVV `indexOfSentinel` benchmark
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const std = @import("std"); | |
/// Number of timed calls made for each buffer size. `main` adds this to
/// `warmup_iterations` to get the loop bound and divides the accumulated
/// counter delta by it to report an average. Despite the name it is a
/// per-size iteration count, not a per-byte one.
const iterations_per_byte = 1000; | |
/// Number of leading calls per buffer size that `main` runs but leaves out of
/// the accumulated total.
const warmup_iterations = 10; | |
/// Benchmark driver for the two `indexOfSentinel*` implementations.
///
/// Usage: `<program> <max_log2_bytes>`.
///
/// For every exponent `e` in `1 .. max_log2_bytes - 1` a buffer of `2^e` bytes
/// is filled with 0xAA and terminated with a single 0 in its last byte, so each
/// scan has to walk the whole buffer. One CSV row is written to stdout per
/// size: `<size>,<avg Faster>,<avg Slower>,` where each average is the mean
/// `rdtsc()` delta over `iterations_per_byte` timed calls, taken after
/// `warmup_iterations` untimed ones.
///
/// Errors: `error.MissingArgument` when no size argument is given, plus
/// whatever argument parsing, `powi` overflow, allocation, affinity pinning
/// and stdout writes return.
pub fn main() !void {
    const allocator = std.heap.smp_allocator;

    // Pin the process to CPU 0 (bit 0 of the affinity mask) so that all
    // measurements are taken on the same core.
    var cpu_set = std.mem.zeroes(std.os.linux.cpu_set_t);
    cpu_set[0] = 0b0001;
    try std.os.linux.sched_setaffinity(0, &cpu_set);

    // Unbuffered writer: every print goes straight to stdout, no flush needed.
    var stdout = std.fs.File.stdout().writer(&.{});
    const writer = &stdout.interface;

    const args = try std.process.argsAlloc(allocator);
    defer std.process.argsFree(allocator, args);
    if (args.len < 2) {
        std.debug.print("usage: <program> <max_log2_bytes>\n", .{});
        return error.MissingArgument;
    }

    const max_bytes = try std.fmt.parseInt(usize, args[1], 10);
    const pow_max_bytes = try std.math.powi(usize, 2, max_bytes);
    // NOTE(review): the buffer holds 2^max_bytes bytes, but the largest size
    // actually benchmarked below is 2^(max_bytes - 1) because the upper bound
    // is exclusive. Kept as in the original; confirm whether that is intended.
    const buffer = try allocator.alloc(u8, pow_max_bytes);
    defer allocator.free(buffer);

    // A `while` loop (instead of `for (1..max_bytes)`) so that an argument of
    // 0 simply benchmarks nothing rather than forming an invalid range.
    var exponent: usize = 1;
    while (exponent < max_bytes) : (exponent += 1) {
        const len = try std.math.powi(usize, 2, exponent);
        const slice = buffer[0..len];

        // Worst case scenario: the sentinel is the last byte, so the scan has
        // to go over everything.
        @memset(slice, 0xAA);
        slice[len - 1] = 0;
        // Sentinel-checked view of the same bytes; avoids an unchecked cast.
        const haystack: [*:0]const u8 = slice[0 .. len - 1 :0].ptr;

        try writer.print("{},", .{len});

        inline for (.{
            .{ indexOfSentinelFaster, "no mov" },
            .{ indexOfSentinelSlower, "with mov" },
        }) |impl| {
            // impl[1] is a human-readable label only; it is not printed.
            const func = impl[0];

            var ticks: u64 = 0;
            for (0..warmup_iterations + iterations_per_byte) |i| {
                const start = rdtsc();
                std.mem.doNotOptimizeAway(func(u8, 0, haystack));
                const end = rdtsc();
                // `>=` so that exactly `iterations_per_byte` samples are
                // summed, matching the divisor below.
                if (i >= warmup_iterations) ticks += end - start;
            }

            // Average counter delta per call at this buffer size.
            const ticks_per_call = ticks / iterations_per_byte;
            try writer.print("{d},", .{ticks_per_call});
        }
        try writer.writeAll("\n");
    }
}
/// Returns the number of `T` elements that precede the first `sentinel`
/// starting at `p`, using RISC-V Vector (RVV) inline assembly.
///
/// The loop issues fault-only-first loads (`vle<N>ff.v`) with LMUL=8 into
/// v8-v15, compares each element against `sentinel` into the mask v0, and uses
/// `vfirst.m` to find the first match. This variant uses `a4` as the scratch
/// register in the final address computation (benchmark label: "no mov").
///
/// `T` must be 8, 16, 32 or 64 bits wide; anything else is a compile error.
/// Requires a target with the V extension.
pub fn indexOfSentinelFaster(comptime T: type, comptime sentinel: T, p: [*:sentinel]const T) usize {
    // log2 of the element size in bytes. `vl` and `vfirst.m` count elements,
    // while the cursor in a3 is a byte address, so counts must be scaled by
    // this amount for elements wider than one byte.
    const shift = switch (@bitSizeOf(T)) {
        8 => 0,
        16 => 1,
        32 => 2,
        64 => 3,
        else => @compileError("unsupported size"),
    };
    const bit_size = 8 << shift;
    const wide = shift != 0;

    // For one-byte elements no scaling instruction is emitted, so the
    // instruction sequence is exactly the byte-oriented one.
    const scale_vl: []const u8 = if (wide)
        std.fmt.comptimePrint("    slli a1, a1, {d}\n", .{shift})
    else
        "";

    // After the loop: a3 = cursor past the last chunk, a1 = size of the last
    // chunk in bytes, a2 = element index of the match inside that chunk.
    const tail: []const u8 = if (wide)
        std.fmt.comptimePrint(
            \\    add a4, %[ptr], a1       # Start + last bump
            \\    sub a3, a3, a4           # Byte offset of the last chunk
            \\    srli a3, a3, {d}         # Bytes -> elements
            \\    add %[result], a3, a2    # Add index inside the chunk
        , .{shift})
    else
        \\    add a4, %[ptr], a1       # Sum start + bump
        \\    add a3, a3, a2           # Add index
        \\    sub %[result], a3, a4    # Subtract start address + bump
    ;

    return asm volatile (std.fmt.comptimePrint(
            \\    mv a3, %[ptr]                  # Save start
            \\    vsetvli a1, zero, e{[bit_size]d}, m8, ta, ma
            \\1:
            \\    vle{[bit_size]d}ff.v v8, (a3)  # Fault-only-first load
            \\    li a1, %[sentinel]             # Load sentinel
            \\    vmseq.vx v0, v8, a1            # v0[i] = (v8[i] == sentinel)
            \\    csrr a1, vl                    # Elements actually read
            \\    vfirst.m a2, v0                # First set bit, or -1
            \\{[scale_vl]s}    add a3, a3, a1                 # Bump pointer
            \\    bltz a2, 1b                    # Not found?
            \\{[tail]s}
        ,
            .{ .bit_size = bit_size, .scale_vl = scale_vl, .tail = tail },
        )
        : [result] "=r" (-> usize),
        : [ptr] "r" (p),
          [sentinel] "i" (sentinel),
          // Scalar scratch registers, the vector registers written by the loop
          // (mask v0 and the LMUL=8 group v8-v15), the vector configuration
          // changed by vsetvli / the fault-only-first load, and "memory"
          // because the asm reads through `p`.
        : "a1", "a2", "a3", "a4", "v0", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "vl", "vtype", "memory"
    );
}
/// Returns the number of `T` elements that precede the first `sentinel`
/// starting at `p`, using RISC-V Vector (RVV) inline assembly.
///
/// Same algorithm as the `a4`-scratch variant of this benchmark: fault-only-
/// first loads (`vle<N>ff.v`) with LMUL=8 into v8-v15, a compare against
/// `sentinel` into the mask v0, and `vfirst.m` to find the first match. The
/// only difference is that this variant uses `a0` as the scratch register in
/// the final address computation (benchmark label: "with mov").
///
/// `T` must be 8, 16, 32 or 64 bits wide; anything else is a compile error.
/// Requires a target with the V extension.
pub fn indexOfSentinelSlower(comptime T: type, comptime sentinel: T, p: [*:sentinel]const T) usize {
    // log2 of the element size in bytes. `vl` and `vfirst.m` count elements,
    // while the cursor in a3 is a byte address, so counts must be scaled by
    // this amount for elements wider than one byte.
    const shift = switch (@bitSizeOf(T)) {
        8 => 0,
        16 => 1,
        32 => 2,
        64 => 3,
        else => @compileError("unsupported size"),
    };
    const bit_size = 8 << shift;
    const wide = shift != 0;

    // For one-byte elements no scaling instruction is emitted, so the
    // instruction sequence is exactly the byte-oriented one.
    const scale_vl: []const u8 = if (wide)
        std.fmt.comptimePrint("    slli a1, a1, {d}\n", .{shift})
    else
        "";

    // After the loop: a3 = cursor past the last chunk, a1 = size of the last
    // chunk in bytes, a2 = element index of the match inside that chunk.
    const tail: []const u8 = if (wide)
        std.fmt.comptimePrint(
            \\    add a0, %[ptr], a1       # Start + last bump
            \\    sub a3, a3, a0           # Byte offset of the last chunk
            \\    srli a3, a3, {d}         # Bytes -> elements
            \\    add %[result], a3, a2    # Add index inside the chunk
        , .{shift})
    else
        \\    add a0, %[ptr], a1       # Sum start + bump
        \\    add a3, a3, a2           # Add index
        \\    sub %[result], a3, a0    # Subtract start address + bump
    ;

    return asm volatile (std.fmt.comptimePrint(
            \\    mv a3, %[ptr]                  # Save start
            \\    vsetvli a1, zero, e{[bit_size]d}, m8, ta, ma
            \\1:
            \\    vle{[bit_size]d}ff.v v8, (a3)  # Fault-only-first load
            \\    li a1, %[sentinel]             # Load sentinel
            \\    vmseq.vx v0, v8, a1            # v0[i] = (v8[i] == sentinel)
            \\    csrr a1, vl                    # Elements actually read
            \\    vfirst.m a2, v0                # First set bit, or -1
            \\{[scale_vl]s}    add a3, a3, a1                 # Bump pointer
            \\    bltz a2, 1b                    # Not found?
            \\{[tail]s}
        ,
            .{ .bit_size = bit_size, .scale_vl = scale_vl, .tail = tail },
        )
        : [result] "=r" (-> usize),
        : [ptr] "r" (p),
          [sentinel] "i" (sentinel),
          // Every register the template overwrites must be declared, otherwise
          // the compiler is free to place `%[ptr]` in one of them: the scalar
          // scratch registers a0-a3, the mask v0, the LMUL=8 group v8-v15 and
          // the vector configuration. "memory" because the asm reads through `p`.
        : "a0", "a1", "a2", "a3", "v0", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "vl", "vtype", "memory"
    );
}
/// Reads the RISC-V `time` counter with the `rdtime` pseudo-instruction and
/// returns its value.
///
/// The name is borrowed from x86; the value is a timer tick count, which is
/// not necessarily a CPU cycle count.
///
/// The asm is `volatile` because it has no inputs: without it the compiler may
/// treat two calls as the same value and merge or reorder them, which would
/// make a start/end pair around timed code meaningless.
fn rdtsc() usize {
    return asm volatile ("rdtime %[out]"
        : [out] "=r" (-> usize),
    );
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment