Skip to content

Instantly share code, notes, and snippets.

@folkertdev
Created April 16, 2025 14:40
Show Gist options
  • Save folkertdev/520d85934b51a1dcea85552b2f4a83fb to your computer and use it in GitHub Desktop.
Save folkertdev/520d85934b51a1dcea85552b2f4a83fb to your computer and use it in GitHub Desktop.
Profiling SIMD saturating subtract
/// Number of times `main` re-runs the selected implementation, each time on a
/// fresh copy of the input table.
const N: usize = 10000;
/// Benchmark driver: runs either the scalar or the AVX2 implementation of the
/// saturating subtract `N` times, selected by the first command-line argument
/// (`scalar` or `simd`).
///
/// Exits with a non-zero status and a message on stderr when the argument is
/// missing or unknown, or when `simd` is requested on a CPU without AVX2.
fn main() {
    let mut args = std::env::args().skip(1); // skip the program name

    // just get a bunch of arbitrary values cheaply
    const INPUT: &[u8] = include_bytes!(
        "/home/folkertdev/.cargo/registry/cache/github.com-1ecc6299db9ec823/clap-4.2.7.crate"
    );

    // SAFETY: every bit pattern is a valid `u16`, so reinterpreting the aligned
    // middle part of the byte slice as `u16` values is sound.
    let (_, table, _) = unsafe { INPUT.align_to::<u16>() };

    // `black_box` keeps the optimizer from constant-folding the window size.
    let wsize = std::hint::black_box(42);

    match args.next().as_deref() {
        Some("scalar") => {
            for _ in 0..N {
                // Start every iteration from the same, unmodified input.
                let mut table = table.to_vec();
                scalar(&mut table, wsize)
            }
        }
        Some("simd") => {
            // `simd` is compiled with AVX2 enabled; executing it on a CPU that
            // lacks AVX2 is undefined behaviour, so verify support first.
            if !std::is_x86_feature_detected!("avx2") {
                eprintln!("The 'simd' benchmark requires a CPU with AVX2 support.");
                std::process::exit(1);
            }

            for _ in 0..N {
                // Start every iteration from the same, unmodified input.
                let mut table = table.to_vec();
                // SAFETY: AVX2 availability was checked at runtime above.
                unsafe { simd(&mut table, wsize) }
            }
        }
        _ => {
            // A bad argument is a user error, not an impossible state, so
            // report it instead of claiming unreachable code was hit.
            eprintln!("Unexpected argument. Only 'scalar' or 'simd' are allowed.");
            std::process::exit(2);
        }
    }
}
/// Subtracts `wsize` from every entry of `table` in place, clamping at zero
/// instead of wrapping around.
pub fn scalar(table: &mut [u16], wsize: u16) {
    table
        .iter_mut()
        .for_each(|entry| *entry = entry.saturating_sub(wsize));
}
/// Subtracts `wsize` from every entry of `table` in place, clamping at zero.
///
/// The bulk of the table is walked in fixed-size chunks of `N` entries, which
/// gives the compiler a constant trip count for the inner loop. The trailing
/// `table.len() % N` entries that do not fill a whole chunk are processed
/// afterwards, so the result matches a plain element-by-element loop for any
/// table length.
///
/// # Panics
///
/// Panics if `N` is zero (a chunk size of zero is rejected by
/// `chunks_exact_mut`).
// Forced inlining lets the loop be compiled as part of the caller, and thus
// with whatever target features the caller enables.
#[inline(always)]
fn generic_slide_hash_chain<const N: usize>(table: &mut [u16], wsize: u16) {
    let mut chunks = table.chunks_exact_mut(N);

    for chunk in chunks.by_ref() {
        for m in chunk.iter_mut() {
            *m = m.saturating_sub(wsize);
        }
    }

    // `chunks_exact_mut` skips the leftover tail; handle it explicitly so no
    // entry is left unmodified.
    for m in chunks.into_remainder() {
        *m = m.saturating_sub(wsize);
    }
}
/// AVX2 build of the saturating subtract over `table`.
///
/// # Safety
///
/// The function is compiled with the `avx2` target feature enabled, so the
/// caller must ensure the running CPU supports AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd(table: &mut [u16], wsize: u16) {
    // 64 `u16` entries are 1024 bits, i.e. four 256-bit values per iteration.
    // That appears to be the optimal amount for AVX2.
    const ENTRIES_PER_ITERATION: usize = 64;

    generic_slide_hash_chain::<ENTRIES_PER_ITERATION>(table, wsize);
}

Benchmark results, measured using poop (https://github.com/andrewrk/poop):

> poop "target/release/playground scalar" "target/release/playground simd"
Benchmark 1 (277 runs): target/release/playground scalar
  measurement          mean ± σ            min … max           outliers         delta
  wall_time          18.0ms ±  540us    17.4ms … 23.6ms         17 ( 6%)        0%
  peak_rss           2.14MB ± 61.5KB    2.10MB … 2.23MB          0 ( 0%)        0%
  cpu_cycles         77.4M  ± 2.15M     75.7M  …  101M          16 ( 6%)        0%
  instructions        157M  ±  292       157M  …  157M           4 ( 1%)        0%
  cache_references   41.8M  ±  862K     36.6M  … 43.4M          29 (10%)        0%
  cache_misses        251K  ± 62.7K      217K  …  710K          50 (18%)        0%
  branch_misses      14.0K  ± 28.8      13.9K  … 14.1K           6 ( 2%)        0%
Benchmark 2 (300 runs): target/release/playground simd
  measurement          mean ± σ            min … max           outliers         delta
  wall_time          16.6ms ±  328us    16.0ms … 19.0ms         17 ( 6%)        ⚡ -  7.6% ±  0.4%
  peak_rss           2.15MB ± 63.5KB    2.10MB … 2.23MB          0 ( 0%)          +  0.3% ±  0.5%
  cpu_cycles         71.3M  ± 1.31M     70.2M  … 81.4M          17 ( 6%)        ⚡ -  8.0% ±  0.4%
  instructions       68.1M  ±  281      68.1M  … 68.1M           1 ( 0%)        ⚡ - 56.7% ±  0.0%
  cache_references   39.6M  ±  812K     31.9M  … 44.1M          22 ( 7%)        ⚡ -  5.4% ±  0.3%
  cache_misses        223K  ± 52.9K      152K  …  507K          64 (21%)        ⚡ - 11.1% ±  3.8%
  branch_misses      14.1K  ±  578      13.9K  … 24.0K           5 ( 2%)          +  0.2% ±  0.5%
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment