// Example: Opcode dispatch in a bytecode VM. Assume the opcode case dispatching is mispredict-heavy,
// and that pc, ins, next_ins, next_opcase are always in registers.
#define a ((ins >> 8) & 0xFF)
#define b ((ins >> 16) & 0xFF)
#define c ((ins >> 24) & 0xFF)
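
// The snippets below leave the surrounding interpreter state implicit. A rough sketch of what they
// assume (the names match the code below, but CODE_SIZE, the types, and the helpers are placeholders):
// 32-bit fixed-width instructions with the opcode in bits 0-7 and the a/b/c fields above, a register
// file, a computed-goto opcase table (GNU C labels-as-values), and a load_ins helper. pc, ins, opcase,
// next_ins, next_opcase and next_pc are locals of the interpreter function so the compiler can keep
// them in registers.
#include <stdint.h>
#include <stdbool.h>
#include <string.h>

#define likely(x)   __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

#define CODE_SIZE (1 << 20)         // placeholder bytecode size in bytes

uint8_t code[CODE_SIZE];            // bytecode; pc is a byte offset that advances by 4
uint64_t regs[256];                 // virtual registers addressed by the a/b/c fields
void *opcases[256];                 // opcases[op] = &&case_XXX for each opcode

static inline uint32_t load_ins(uint32_t pc) {
    uint32_t ins;
    memcpy(&ins, &code[pc], sizeof(ins));   // 4-byte instruction fetch at the byte offset pc
    return ins;
}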
// Version 1: Synchronous instruction fetch and opcode dispatch. The big bottleneck is that the essential
// work per opcode case is so light (something like ADD is typical) that you're dominated by the cost of
// the opcode dispatch branch mispredicts. When there's a mispredict, the pipeline restarts at the right
// opcode case. To prepare for the next opcode after that, the code in NEXT has to fetch the instruction
// at pc + 4 and then look up the opcode case pointer for the opcode in the fetched instruction.
// Assuming all the loads hit L1, on Zen 3 the first load would be 4 cycles, then the dependent load (with a
// complex addressing mode) from the opcode table is 5 cycles, plus the 1 cycle of latency from the & 0xFF,
// so it's going to be at least 9 cycles before the control dependency for the next opcode dispatch is ready.
// So if that next opcode dispatch will also mispredict, you won't find out until 9 cycles later than you
// would have if the indirect jump target had been in hand immediately after the restart. You can think of
// the latency of the control dependency chain as being added to the mispredict penalty. The usual number
// quoted for the mispredict penalty is 15-20 cycles, depending on the CPU, and 9 cycles is 50% or more of
// that. On Skylake, where the mispredict penalty is 15-16 cycles and L1 loads are slower, it's closer to 100%.
#define NEXT \
    do { \
        pc += 4; \
        ins = load_ins(pc); \
        goto *opcases[ins & 0xFF]; \
    } while (0)

case_ADD:
    regs[a] = regs[b] + regs[c];
    NEXT;

case_JEQ:
    // Control-flow opcodes compute their own pc, so they can't reuse NEXT's unconditional pc += 4.
    pc = regs[b] == regs[c] ? pc + (int8_t)a : pc + 4;
    ins = load_ins(pc);
    goto *opcases[ins & 0xFF];
// Version 2: Speculative instruction/opcase prefetch. As soon as we enter an opcode case, we know the
// indirect jump target for the sequentially next PC, which is almost always the actual next PC (except
// for control-flow opcodes where the branch is taken). This means the effective penalty for a back-to-back
// opcode dispatch mispredict goes down by the 9+ cycles quoted in the last comment block. One way to think
// about what's going on here is that when a dispatch mispredict happens, you have 15-20 latency cycles of
// work you can have in flight "for free", which will be ready with 0 latency after the post-mispredict
// restart. Obviously "for free" has to be heavily qualified. In stop-and-go, mispredict-heavy code like
// this example, your pipeline resources are underutilized (in an absolute sense and in a throughput vs
// goodput sense), so "stealing" resources for a predicted-but-maybe-wasted code path can be a good
// trade-off. E.g. if you have a hard-to-predict two-way branch where branch prediction is effectively
// useless, you can get started on work for _both_ branches in parallel so that whichever branch ends up
// being the right one, you will have a head start on the initial wave, especially the latency-sensitive
// parts. (Variant 2 of case_JEQ below uses exactly this trick for the taken path.)
#define NEXT \
    do { \
        pc += 4; \
        ins = next_ins; \
        opcase = next_opcase; \
        next_ins = load_ins(pc + 4); \
        next_opcase = opcases[next_ins & 0xFF]; \
        goto *opcase; \
    } while (0)

case_ADD:
    regs[a] = regs[b] + regs[c];
    NEXT;
// Variant 1: Branchless, doesn't benefit from prefetched ins/opcase.
case_JEQ:
    pc = regs[b] == regs[c] ? pc + (int8_t)a : pc + 4;
    ins = load_ins(pc);
    opcase = opcases[ins & 0xFF];
    next_ins = load_ins(pc + 4);
    next_opcase = opcases[next_ins & 0xFF];
    goto *opcase;
// Variant 2: Branch-based, predict JEQ not taken, benefits from prefetched ins/opcase when not taken.
case_JEQ: {
    // Prefetching the taken path up front saves 9+ cycles (assuming the next 'goto *opcase'
    // mispredicts) when the 'unlikely' path is taken.
    bool taken = regs[b] == regs[c];
    uint32_t taken_pc = pc + (int8_t)a;
    uint32_t taken_ins = load_ins(taken_pc);
    void *taken_opcase = opcases[taken_ins & 0xFF];
    if (unlikely(taken)) {
        pc = taken_pc;
        ins = taken_ins;
        opcase = taken_opcase;
    } else {
        pc += 4;
        ins = next_ins;
        opcase = next_opcase;
    }
    next_ins = load_ins(pc + 4);
    next_opcase = opcases[next_ins & 0xFF];
    goto *opcase;
}
// Version 3: Let's go nuts and add a fully associative, unbounded branch target buffer. This means taken
// branches will be on the hot path as long as their branch targets are consistent. This is mostly just a
// fun example. To avoid adding an extra load's worth of latency to the prefetch path, we store the
// predicted opcase directly in the BTB alongside the predicted PC. Note that next_opcase's latency is now
// that of 1 load instead of 2 dependent loads, although the BTB load will be colder than the shared opcase
// table load if the PC is cold. You could also cache the instruction in the BTB to remove the dependent
// instruction load, but remember we are focused mainly on reducing opcase latency. I'm not sure how
// practical this all is, but at least it's interesting to contemplate. Caching the predicted opcase in the
// BTB feels like it belongs somewhere in the family tree of threaded code techniques. A serious
// implementation of this technique would want something less naive than this hysteresis-free "last taken"
// predictor, but it's the simplest thing you can do beyond statically predicting pc + 4, which is why it's
// the first thing students are asked to implement when learning how to design and implement a pipelined
// CPU in a computer architecture course.
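
// The Version 3 code below additionally assumes BTB storage and a branchless select helper along these
// lines (a sketch: the "unbounded" BTB is modeled here as one entry per byte of code, indexed directly
// by pc just like the code below does; a real implementation would hash pc into a fixed-size table and
// deal with aliasing). Entries start at 0, i.e. a cold prediction that the fast path in NEXT simply
// misses on.
uint32_t btb_pc[CODE_SIZE];         // predicted next pc for the instruction at each pc
void *btb_opcase[CODE_SIZE];        // predicted opcase at that predicted next pc

// Branchless select used by case_JEQ below; the intent is for the ternary to compile to a conditional
// move rather than a branch.
static inline uint32_t select(bool cond, uint32_t if_true, uint32_t if_false) {
    return cond ? if_true : if_false;
}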
#define NEXT \
    do { \
        if (likely(next_pc == pc + 4)) { \
            pc = next_pc; \
            ins = next_ins; \
            opcase = next_opcase; \
        } else { \
            uint32_t prev_pc = pc; \
            pc += 4; \
            ins = load_ins(pc); \
            opcase = opcases[ins & 0xFF]; \
            btb_pc[prev_pc] = pc; \
            btb_opcase[prev_pc] = opcase; \
        } \
        next_pc = btb_pc[pc]; \
        next_ins = load_ins(next_pc); \
        next_opcase = btb_opcase[pc]; \
        goto *opcase; \
    } while (0)
case_ADD:
    // Same as before

case_JEQ: {
    bool taken = regs[b] == regs[c];
    uint32_t taken_pc = pc + (int8_t)a;
    uint32_t actual_next_pc = select(taken, taken_pc, pc + 4);
    // actual_next_pc is computed with a branchless select so this comparison doesn't turn into
    // short-circuiting branches.
    if (likely(next_pc == actual_next_pc)) {
        pc = next_pc;
        ins = next_ins;
        opcase = next_opcase;
    } else {
        uint32_t prev_pc = pc;
        // A branchless fallback path (variant 1) seems reasonable but branch-based (variant 2) would work too.
        pc = actual_next_pc;
        ins = load_ins(pc);
        opcase = opcases[ins & 0xFF];
        btb_pc[prev_pc] = pc;
        btb_opcase[prev_pc] = opcase;
    }
    next_pc = btb_pc[pc];
    next_ins = load_ins(next_pc);
    next_opcase = btb_opcase[pc];
    goto *opcase;
}
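
// A sketch (not part of the original) of how the Version 2 dispatch loop might be primed and entered:
// the invariant is that next_ins/next_opcase already describe the instruction at pc + 4 before every
// dispatch, including the very first one. OP_ADD/OP_HALT are hypothetical opcode numbers and case_HALT
// is a made-up opcode so the interpreter has an exit; everything else matches the Version 2 code above.
enum { OP_HALT = 0, OP_ADD = 1 };   // hypothetical opcode numbering

void interpret(uint32_t start_pc) {
    // Label addresses are only visible inside the function, so the opcase table is filled in here.
    opcases[OP_ADD] = &&case_ADD;
    opcases[OP_HALT] = &&case_HALT;

    uint32_t pc = start_pc;
    uint32_t ins = load_ins(pc);
    void *opcase = opcases[ins & 0xFF];
    uint32_t next_ins = load_ins(pc + 4);
    void *next_opcase = opcases[next_ins & 0xFF];
    goto *opcase;

case_ADD:
    regs[a] = regs[b] + regs[c];
    // Version 2's NEXT, written out: consume the prefetched ins/opcase, then prefetch pc + 8.
    pc += 4;
    ins = next_ins;
    opcase = next_opcase;
    next_ins = load_ins(pc + 4);
    next_opcase = opcases[next_ins & 0xFF];
    goto *opcase;

case_HALT:
    return;
}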