Skip to content

Instantly share code, notes, and snippets.

@Rexicon226
Created August 3, 2025 01:13
Show Gist options
  • Save Rexicon226/17b77a0c4121839d7bf98e38f583e4f9 to your computer and use it in GitHub Desktop.
Save Rexicon226/17b77a0c4121839d7bf98e38f583e4f9 to your computer and use it in GitHub Desktop.
RVV `indexOfSentinel` benchmark
const std = @import("std");
/// Number of timed calls whose elapsed time is averaged for each buffer size.
/// Despite the name, `main` uses it per buffer size (loop bound and divisor),
/// not per byte.
const iterations_per_byte = 1000;
/// Number of extra leading calls made for each buffer size before timings
/// start being accumulated, so that the first, cold calls do not skew the average.
const warmup_iterations = 10;
/// Benchmark driver.
///
/// Usage: `<exe> <max_exponent>`
///
/// For every exponent `e` in `1 .. max_exponent - 1` a buffer of `2^e` bytes is
/// filled with 0xAA and terminated with a single 0 in its last byte. Each
/// implementation is then called `warmup_iterations + iterations_per_byte`
/// times on it; the time of the warm-up calls is discarded and the average of
/// the remaining calls (in units of whatever `rdtsc()` returns) is printed.
///
/// Output is one CSV row per buffer size on stdout:
/// `<size>,<avg impl 0>,<avg impl 1>,` followed by a newline.
///
/// Errors: `error.MissingArgument` when no exponent is given, plus whatever
/// argument parsing, `powi` overflow, allocation, `sched_setaffinity` or the
/// stdout writer report.
pub fn main() !void {
    const allocator = std.heap.smp_allocator;

    // Pin the process to a single core (mask 0b0001, i.e. the first CPU) so the
    // scheduler does not migrate us between measurements.
    const cpu0001: std.os.linux.cpu_set_t = [1]usize{0b0001} ++ ([_]usize{0} ** (16 - 1));
    try std.os.linux.sched_setaffinity(0, &cpu0001);

    var stdout = std.fs.File.stdout().writer(&.{});
    const writer = &stdout.interface;

    const args = try std.process.argsAlloc(allocator);
    defer std.process.argsFree(allocator, args);

    // Indexing args[1] without this check would be an out-of-bounds access
    // when the program is started without arguments.
    if (args.len < 2) {
        std.debug.print("usage: <exe> <max_exponent>\n", .{});
        return error.MissingArgument;
    }

    const max_exponent = try std.fmt.parseInt(usize, args[1], 10);
    // The loop below starts at exponent 1, so 0 leaves nothing to benchmark.
    if (max_exponent == 0) return;

    const buffer_len = try std.math.powi(usize, 2, max_exponent);
    const buffer = try allocator.alloc(u8, buffer_len);
    defer allocator.free(buffer);

    // NOTE(review): the range is exclusive, so the largest benchmarked size is
    // 2^(max_exponent - 1) even though 2^max_exponent bytes are allocated.
    // Kept as in the original; confirm whether the last size was meant to be included.
    for (1..max_exponent) |exponent| {
        const index = try std.math.powi(usize, 2, exponent);
        const slice = buffer[0..index];

        // Worst case scenario: the sentinel is at the last byte, so the whole
        // slice has to be scanned.
        @memset(slice, 0xAA);
        slice[index - 1] = 0;

        try writer.print("{},", .{index});

        inline for (.{
            .{ indexOfSentinelFaster, "no mov" },
            .{ indexOfSentinelSlower, "with mov" },
        }) |impl| {
            const func, const name = impl;
            _ = name; // label is documentation only; it is not part of the output

            var i: u32 = 0;
            var cycles: u64 = 0;
            while (i < iterations_per_byte + warmup_iterations) : (i += 1) {
                const start = rdtsc();
                std.mem.doNotOptimizeAway(func(u8, 0, @ptrCast(slice)));
                const end = rdtsc();
                // `>=` so that exactly `iterations_per_byte` samples are summed;
                // `>` would also drop the first timed sample while the divisor
                // below still assumes the full count.
                if (i >= warmup_iterations) cycles += (end - start);
            }

            // Average per call on this buffer size (not per byte).
            const average_per_call = cycles / iterations_per_byte;
            try writer.print("{d},", .{average_per_call});
        }
        try writer.writeAll("\n");
    }
}
/// Returns the index (in elements of `T`) of the first `sentinel` found at or
/// after `p`, using RISC-V Vector fault-only-first loads.
///
/// `T` must be 8, 16, 32 or 64 bits wide; any other width is a compile error.
/// `sentinel` is materialised as an immediate inside the assembly.
///
/// Register usage inside the assembly:
///   a1 - sentinel value, then the element count reported by `vl`
///   a2 - index of the first match within the current chunk (negative if none)
///   a3 - running pointer
///   a4 - scratch for `start + last bump`
///   v0 - comparison mask, v8..v15 - loaded data (LMUL = 8 register group)
///
/// For element types wider than one byte the element counts taken from `vl`
/// and `vfirst.m` are scaled to bytes before being added to the pointer, and
/// the final byte distance is scaled back to elements. For 8-bit elements no
/// scaling instructions are emitted, so the instruction sequence is the same
/// as without scaling.
pub fn indexOfSentinelFaster(comptime T: type, comptime sentinel: T, p: [*:sentinel]const T) usize {
    // log2 of the element size in bytes; used to convert element counts to byte offsets.
    const byte_shift = switch (@bitSizeOf(T)) {
        8 => 0,
        16 => 1,
        32 => 2,
        64 => 3,
        else => @compileError("unsupported size"),
    };

    // Optional scaling instructions; empty for byte-sized elements so the
    // 8-bit path keeps its original instruction stream.
    const scale_count: []const u8 = if (byte_shift == 0)
        ""
    else
        std.fmt.comptimePrint(" slli a1, a1, {d}", .{byte_shift});
    const scale_index: []const u8 = if (byte_shift == 0)
        ""
    else
        std.fmt.comptimePrint(" slli a2, a2, {d}", .{byte_shift});
    const unscale_result: []const u8 = if (byte_shift == 0)
        ""
    else
        std.fmt.comptimePrint(" srli %[result], %[result], {d}", .{byte_shift});

    return asm volatile (std.fmt.comptimePrint(
            \\ mv a3, %[ptr]                                 # Save start
            \\ vsetvli a1, zero, e{[bit_size]}, m8, ta, ma   # Vector of elements of maximum length
            \\
            \\1:
            \\ vle{[bit_size]}ff.v v8, (a3)                  # Load elements (fault-only-first)
            \\ li a1, %[sentinel]                            # Load sentinel into register
            \\ vmseq.vx v0, v8, a1                           # Set v0[i] where v8[i] = sentinel
            \\ csrr a1, vl                                   # Get elements read
            \\ vfirst.m a2, v0                               # Find first set bit
            \\{[scale_count]s}
            \\ add a3, a3, a1                                # Bump pointer
            \\ bltz a2, 1b                                   # Not found?
            \\
            \\ add a4, %[ptr], a1                            # Sum start + bump
            \\{[scale_index]s}
            \\ add a3, a3, a2                                # Add index
            \\ sub %[result], a3, a4                         # Subtract start address + bump
            \\{[unscale_result]s}
        ,
            .{
                .bit_size = @bitSizeOf(T),
                .scale_count = scale_count,
                .scale_index = scale_index,
                .unscale_result = unscale_result,
            },
        )
        : [result] "=r" (-> usize),
        : [ptr] "r" (p),
          [sentinel] "i" (sentinel),
          // Every register the template writes is listed, plus "memory"
          // because the template reads the buffer behind `p`.
        : "a1", "a2", "a3", "a4", "v0", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
    );
}
/// Returns the index (in elements of `T`) of the first `sentinel` found at or
/// after `p`, using RISC-V Vector fault-only-first loads.
///
/// The assembly matches `indexOfSentinelFaster` except that it uses `a0`
/// instead of `a4` as the scratch register for `start + last bump`.
///
/// `T` must be 8, 16, 32 or 64 bits wide; any other width is a compile error.
/// `sentinel` is materialised as an immediate inside the assembly.
///
/// Register usage inside the assembly:
///   a0 - scratch for `start + last bump`
///   a1 - sentinel value, then the element count reported by `vl`
///   a2 - index of the first match within the current chunk (negative if none)
///   a3 - running pointer
///   v0 - comparison mask, v8..v15 - loaded data (LMUL = 8 register group)
///
/// For element types wider than one byte the element counts taken from `vl`
/// and `vfirst.m` are scaled to bytes before being added to the pointer, and
/// the final byte distance is scaled back to elements. For 8-bit elements no
/// scaling instructions are emitted.
pub fn indexOfSentinelSlower(comptime T: type, comptime sentinel: T, p: [*:sentinel]const T) usize {
    // log2 of the element size in bytes; used to convert element counts to byte offsets.
    const byte_shift = switch (@bitSizeOf(T)) {
        8 => 0,
        16 => 1,
        32 => 2,
        64 => 3,
        else => @compileError("unsupported size"),
    };

    // Optional scaling instructions; empty for byte-sized elements so the
    // 8-bit path keeps its original instruction stream.
    const scale_count: []const u8 = if (byte_shift == 0)
        ""
    else
        std.fmt.comptimePrint(" slli a1, a1, {d}", .{byte_shift});
    const scale_index: []const u8 = if (byte_shift == 0)
        ""
    else
        std.fmt.comptimePrint(" slli a2, a2, {d}", .{byte_shift});
    const unscale_result: []const u8 = if (byte_shift == 0)
        ""
    else
        std.fmt.comptimePrint(" srli %[result], %[result], {d}", .{byte_shift});

    return asm volatile (std.fmt.comptimePrint(
            \\ mv a3, %[ptr]                                 # Save start
            \\ vsetvli a1, zero, e{[bit_size]}, m8, ta, ma   # Vector of elements of maximum length
            \\
            \\1:
            \\ vle{[bit_size]}ff.v v8, (a3)                  # Load elements (fault-only-first)
            \\ li a1, %[sentinel]                            # Load sentinel into register
            \\ vmseq.vx v0, v8, a1                           # Set v0[i] where v8[i] = sentinel
            \\ csrr a1, vl                                   # Get elements read
            \\ vfirst.m a2, v0                               # Find first set bit
            \\{[scale_count]s}
            \\ add a3, a3, a1                                # Bump pointer
            \\ bltz a2, 1b                                   # Not found?
            \\
            \\ add a0, %[ptr], a1                            # Sum start + bump
            \\{[scale_index]s}
            \\ add a3, a3, a2                                # Add index
            \\ sub %[result], a3, a0                         # Subtract start address + bump
            \\{[unscale_result]s}
        ,
            .{
                .bit_size = @bitSizeOf(T),
                .scale_count = scale_count,
                .scale_index = scale_index,
                .unscale_result = unscale_result,
            },
        )
        : [result] "=r" (-> usize),
        : [ptr] "r" (p),
          [sentinel] "i" (sentinel),
          // The template overwrites a0-a3, v0 and v8-v15 and reads the buffer
          // behind `p`; all of that has to be declared so the compiler does
          // not keep live values in those registers or defer stores to the buffer.
        : "a0", "a1", "a2", "a3", "v0", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
    );
}
/// Reads the current counter value used to time the benchmark.
///
/// Despite the x86-flavoured name, this executes the RISC-V `rdtime`
/// pseudo-instruction, which reads the `time` counter (timer ticks), not a
/// core cycle counter.
fn rdtsc() usize {
    // `volatile` marks the read as having side effects. Without it the
    // expression has no inputs and is treated as pure, so the optimizer is
    // allowed to merge or hoist the two reads that bracket a measurement.
    return asm volatile ("rdtime %[out]"
        : [out] "=r" (-> usize),
    );
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment