// Example: Opcode dispatch in a bytecode VM. Assume the opcode case dispatching is mispredict-heavy,
// and that pc, ins, next_ins, next_opcase are always in registers.
#define a ((ins >> 8) & 0xFF)
#define b ((ins >> 16) & 0xFF)
#define c ((ins >> 24) & 0xFF)
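
// The snippets below leave the surrounding interpreter state implicit. A rough sketch of what they
// assume (the names match the code below, but CODE_SIZE, the types, and the helpers are placeholders):
// 32-bit fixed-width instructions with the opcode in bits 0-7 and the a/b/c fields above, a register
// file, a computed-goto opcase table (GNU C labels-as-values), and a load_ins helper. pc, ins, opcase,
// next_ins, next_opcase and next_pc are locals of the interpreter function so the compiler can keep
// them in registers.
#include <stdint.h>
#include <stdbool.h>
#include <string.h>

#define likely(x)   __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

#define CODE_SIZE (1 << 20)         // placeholder bytecode size in bytes

uint8_t code[CODE_SIZE];            // bytecode; pc is a byte offset that advances by 4
uint64_t regs[256];                 // virtual registers addressed by the a/b/c fields
void *opcases[256];                 // opcases[op] = &&case_XXX for each opcode

static inline uint32_t load_ins(uint32_t pc) {
    uint32_t ins;
    memcpy(&ins, &code[pc], sizeof(ins));   // 4-byte instruction fetch at the byte offset pc
    return ins;
}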
// Version 1: Synchronous instruction fetch and opcode dispatch. The big bottleneck is that the essential
// work per opcode case is so light (something like ADD is typical) that you're dominated by the cost of
// the opcode dispatch branch mispredicts. When there's a mispredict, the pipeline restarts at the right
// opcode case. To prepare for the next opcode after that, the code in NEXT has to fetch the instruction
// at pc + 4 and then look up the opcode case pointer for the opcode in the fetched instruction.
// Assuming all the loads hit L1, on Zen 3 the first load would be 4 cycles, then the dependent load (with a
// complex addressing mode) from the opcode table is 5 cycles, plus the 1 cycle of latency from the & 0xFF,
// so it's going to be at least 9 cycles before the control dependency for the next opcode dispatch is ready.
// So if that next opcode dispatch will also mispredict, you won't find out until 9 cycles later than you
// would have if the indirect jump target had been in hand immediately after the restart. You can think of
// the latency of the control dependency chain as being added to the mispredict penalty. The usual number
// quoted for the mispredict penalty is 15-20 cycles, depending on the CPU, and 9 cycles is 50% or more of
// that. On Skylake, where the mispredict penalty is 15-16 cycles and L1 loads are slower, it's closer to 100%.
#define NEXT \
    do { \
        pc += 4; \
        ins = load_ins(pc); \
        goto *opcases[ins & 0xFF]; \
    } while (0)

case_ADD:
    regs[a] = regs[b] + regs[c];
    NEXT;

case_JEQ:
    // Control-flow opcodes compute their own pc, so they can't reuse NEXT's unconditional pc += 4.
    pc = regs[b] == regs[c] ? pc + (int8_t)a : pc + 4;
    ins = load_ins(pc);
    goto *opcases[ins & 0xFF];
// Version 2: Speculative instruction/opcase prefetch. As soon as we enter an opcode case, we know the
// indirect jump target for the sequentially next PC, which is almost always the actual next PC (except
// for control-flow opcodes where the branch is taken). This means the effective penalty for a back-to-back
// opcode dispatch mispredict goes down by the 9+ cycles quoted in the last comment block. One way to think
// about what's going on here is that when a dispatch mispredict happens, you have 15-20 latency cycles of
// work you can have in flight "for free", which will be ready with 0 latency after the post-mispredict
// restart. Obviously "for free" has to be heavily qualified. In stop-and-go, mispredict-heavy code like
// this example, your pipeline resources are underutilized (in an absolute sense and in a throughput vs
// goodput sense), so "stealing" resources for a predicted-but-maybe-wasted code path can be a good
// trade-off. E.g. if you have a hard-to-predict two-way branch where branch prediction is effectively
// useless, you can get started on work for _both_ branches in parallel so that whichever branch ends up
// being the right one, you will have a head start on the initial wave, especially the latency-sensitive
// parts. (Variant 2 of case_JEQ below uses exactly this trick for the taken path.)
#define NEXT \
    do { \
        pc += 4; \
        ins = next_ins; \
        opcase = next_opcase; \
        next_ins = load_ins(pc + 4); \
        next_opcase = opcases[next_ins & 0xFF]; \
        goto *opcase; \
    } while (0)

case_ADD:
    regs[a] = regs[b] + regs[c];
    NEXT;
// Variant 1: Branchless, doesn't benefit from prefetched ins/opcase.
case_JEQ:
    pc = regs[b] == regs[c] ? pc + (int8_t)a : pc + 4;
    ins = load_ins(pc);
    opcase = opcases[ins & 0xFF];
    next_ins = load_ins(pc + 4);
    next_opcase = opcases[next_ins & 0xFF];
    goto *opcase;
// Variant 2: Branch-based, predict JEQ not taken, benefits from prefetched ins/opcase when not taken.
case_JEQ: {
    // Prefetching the taken path up front saves 9+ cycles (assuming the next 'goto *opcase'
    // mispredicts) when the 'unlikely' path is taken.
    bool taken = regs[b] == regs[c];
    uint32_t taken_pc = pc + (int8_t)a;
    uint32_t taken_ins = load_ins(taken_pc);
    void *taken_opcase = opcases[taken_ins & 0xFF];
    if (unlikely(taken)) {
        pc = taken_pc;
        ins = taken_ins;
        opcase = taken_opcase;
    } else {
        pc += 4;
        ins = next_ins;
        opcase = next_opcase;
    }
    next_ins = load_ins(pc + 4);
    next_opcase = opcases[next_ins & 0xFF];
    goto *opcase;
}
// Version 3: Let's go nuts and add a fully associative, unbounded branch target buffer. This means taken
// branches will be on the hot path as long as their branch targets are consistent. This is mostly just a
// fun example. To avoid adding an extra load's worth of latency to the prefetch path, we store the
// predicted opcase directly in the BTB alongside the predicted PC. Note that next_opcase's latency is now
// that of 1 load instead of 2 dependent loads, although the BTB load will be colder than the shared opcase
// table load if the PC is cold. You could also cache the instruction in the BTB to remove the dependent
// instruction load, but remember we are focused mainly on reducing opcase latency. I'm not sure how
// practical this all is, but at least it's interesting to contemplate. Caching the predicted opcase in the
// BTB feels like it belongs somewhere in the family tree of threaded code techniques. A serious
// implementation of this technique would want something less naive than this hysteresis-free "last taken"
// predictor, but it's the simplest thing you can do beyond statically predicting pc + 4, which is why it's
// the first thing students are asked to implement when learning how to design and implement a pipelined
// CPU in a computer architecture course.
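
// The Version 3 code below additionally assumes BTB storage and a branchless select helper along these
// lines (a sketch: the "unbounded" BTB is modeled here as one entry per byte of code, indexed directly
// by pc just like the code below does; a real implementation would hash pc into a fixed-size table and
// deal with aliasing). Entries start at 0, i.e. a cold prediction that the fast path in NEXT simply
// misses on.
uint32_t btb_pc[CODE_SIZE];         // predicted next pc for the instruction at each pc
void *btb_opcase[CODE_SIZE];        // predicted opcase at that predicted next pc

// Branchless select used by case_JEQ below; the intent is for the ternary to compile to a conditional
// move rather than a branch.
static inline uint32_t select(bool cond, uint32_t if_true, uint32_t if_false) {
    return cond ? if_true : if_false;
}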
#define NEXT \
    do { \
        if (likely(next_pc == pc + 4)) { \
            pc = next_pc; \
            ins = next_ins; \
            opcase = next_opcase; \
        } else { \
            uint32_t prev_pc = pc; \
            pc += 4; \
            ins = load_ins(pc); \
            opcase = opcases[ins & 0xFF]; \
            btb_pc[prev_pc] = pc; \
            btb_opcase[prev_pc] = opcase; \
        } \
        next_pc = btb_pc[pc]; \
        next_ins = load_ins(next_pc); \
        next_opcase = btb_opcase[pc]; \
        goto *opcase; \
    } while (0)
case_ADD:
    // Same as before

case_JEQ: {
    bool taken = regs[b] == regs[c];
    uint32_t taken_pc = pc + (int8_t)a;
    uint32_t actual_next_pc = select(taken, taken_pc, pc + 4);
    // actual_next_pc is computed with a branchless select so this comparison doesn't turn into
    // short-circuiting branches.
    if (likely(next_pc == actual_next_pc)) {
        pc = next_pc;
        ins = next_ins;
        opcase = next_opcase;
    } else {
        uint32_t prev_pc = pc;
        // A branchless fallback path (variant 1) seems reasonable but branch-based (variant 2) would work too.
        pc = actual_next_pc;
        ins = load_ins(pc);
        opcase = opcases[ins & 0xFF];
        btb_pc[prev_pc] = pc;
        btb_opcase[prev_pc] = opcase;
    }
    next_pc = btb_pc[pc];
    next_ins = load_ins(next_pc);
    next_opcase = btb_opcase[pc];
    goto *opcase;
}
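
// A sketch (not part of the original) of how the Version 2 dispatch loop might be primed and entered:
// the invariant is that next_ins/next_opcase already describe the instruction at pc + 4 before every
// dispatch, including the very first one. OP_ADD/OP_HALT are hypothetical opcode numbers and case_HALT
// is a made-up opcode so the interpreter has an exit; everything else matches the Version 2 code above.
enum { OP_HALT = 0, OP_ADD = 1 };   // hypothetical opcode numbering

void interpret(uint32_t start_pc) {
    // Label addresses are only visible inside the function, so the opcase table is filled in here.
    opcases[OP_ADD] = &&case_ADD;
    opcases[OP_HALT] = &&case_HALT;

    uint32_t pc = start_pc;
    uint32_t ins = load_ins(pc);
    void *opcase = opcases[ins & 0xFF];
    uint32_t next_ins = load_ins(pc + 4);
    void *next_opcase = opcases[next_ins & 0xFF];
    goto *opcase;

case_ADD:
    regs[a] = regs[b] + regs[c];
    // Version 2's NEXT, written out: consume the prefetched ins/opcase, then prefetch pc + 8.
    pc += 4;
    ins = next_ins;
    opcase = next_opcase;
    next_ins = load_ins(pc + 4);
    next_opcase = opcases[next_ins & 0xFF];
    goto *opcase;

case_HALT:
    return;
}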