Skip to content

Instantly share code, notes, and snippets.

@laiso
Last active June 22, 2025 02:15
Show Gist options
  • Save laiso/9869cd9d48595c13a1a3ff83419dfb56 to your computer and use it in GitHub Desktop.
Save laiso/9869cd9d48595c13a1a3ff83419dfb56 to your computer and use it in GitHub Desktop.
Benchmarks CLI agents (Claude Code, Codex CLI, Goose CLI and Aider) on Exercism TypeScript programming exercises https://github.com/exercism/typescript/tree/main/exercises/practice
#!/usr/bin/env bun
import { spawn } from "bun";
import { join } from "path";
import { readdir } from "fs/promises";
/** Docker image name passed to every `docker run` invocation in this script (agent runs and test runs alike). */
const CLAUDE_CODE_CONTAINER = "cli-agents-benchmark";
/** Location of the Exercism TypeScript practice exercises, relative to the current working directory. */
const EXERCISM_PRACTICE_PATH = "exercism/typescript/exercises/practice";
/**
 * Instruction text handed to the agents.
 * NOTE(review): the value itself contains literal single quotes and is interpolated
 * inside double quotes in the shell command strings, so the agent presumably
 * receives the quote characters as part of the prompt — confirm that is intended.
 */
const SYSTEM_PROMPT = "'Solve this TypeScript exercise. Read the test file to understand requirements and implement the solution.'";
/** Outcome of one spawned phase (agent run or test run) for a single exercise. */
interface AgentResult {
  /** Name of the exercise the phase ran against. */
  exercise: string;
  /** Elapsed time of the phase in milliseconds. */
  duration: number;
  /** Whether the phase finished successfully. */
  success: boolean;
  /** Captured standard output, when the process ran to completion. */
  output?: string;
  /** Failure description; only set on unsuccessful results. */
  error?: string;
}
/** Combined per-exercise record: agent phase plus test phase. */
interface TestResult {
  /** Name of the exercise. */
  exercise: string;

  /** True only when both the agent phase and the test phase succeeded. */
  overallSuccess: boolean;
  /** Whether the agent phase succeeded. */
  agentSuccess: boolean;
  /** Whether the test phase succeeded. */
  testSuccess: boolean;

  /** Failure description from the agent phase, if any. */
  agentError?: string;
  /** Failure description from the test phase, if any. */
  testError?: string;

  /** Elapsed time of the whole exercise run, in milliseconds. */
  totalDuration: number;
  /** Elapsed time of the agent phase, in milliseconds. */
  agentDuration: number;
  /** Elapsed time of the test phase, in milliseconds. */
  testDuration: number;
}
/** Settings shared by every exercise in one benchmark run. */
interface BenchmarkConfig {
  /** Which CLI agent to launch; the command builder recognises 'claude', 'goose', 'aider' and 'codex'. */
  agent: string;
  /** Model name forwarded to the agent. */
  model: string;
  /** Provider name; exported to the goose agent as GOOSE_PROVIDER. */
  provider: string;
  /** Shell command executed inside the container to run the exercise's tests. */
  testCommand: string;
  /** When true, the commands being run and truncated process output are logged. */
  verbose: boolean;
}
/**
 * Lists the practice exercises found under EXERCISM_PRACTICE_PATH
 * (resolved against the current working directory).
 *
 * @returns The names of all sub-directories whose name does not start with '.', sorted.
 */
async function getPracticeExercises(): Promise<string[]> {
  const root = join(process.cwd(), EXERCISM_PRACTICE_PATH);
  const names: string[] = [];
  for (const dirent of await readdir(root, { withFileTypes: true })) {
    // Skip plain files and hidden directories.
    if (!dirent.isDirectory() || dirent.name.startsWith('.')) {
      continue;
    }
    names.push(dirent.name);
  }
  names.sort();
  return names;
}
/**
 * Finds the test files of an exercise.
 *
 * @param exercisePath Exercise directory, relative to the current working directory.
 * @returns File names (not paths) ending in `.test.ts`; an empty list, after a
 *          console warning, when the directory cannot be read.
 */
async function getTestFiles(exercisePath: string): Promise<string[]> {
  let names: string[];
  try {
    names = await readdir(join(process.cwd(), exercisePath));
  } catch (error) {
    console.warn(`Warning: Could not read test files from ${exercisePath}`);
    return [];
  }
  const testFiles: string[] = [];
  for (const name of names) {
    if (name.endsWith('.test.ts')) {
      testFiles.push(name);
    }
  }
  return testFiles;
}
/**
 * Produces the shell command used to run an exercise's tests.
 *
 * @param config Benchmark settings.
 * @returns The configured `testCommand`, unchanged.
 */
function buildTestCommand(config: BenchmarkConfig): string {
  const { testCommand } = config;
  return testCommand;
}
/** Wraps a value in single quotes so that `sh -c` treats it as one literal word. */
function shellQuote(value: string): string {
  // Inside single quotes nothing is special; an embedded ' is written as '\''.
  return "'" + value.replace(/'/g, "'\\''") + "'";
}

/**
 * Builds the `docker run` argument vector that launches the selected agent
 * inside the benchmark container, with the exercise directory bind-mounted at
 * /workspace (which is also the working directory).
 *
 * API keys are forwarded by name (`-e NAME`), so their values come from the
 * environment of the process that spawns docker.
 *
 * @param config Benchmark settings; `agent` selects the command, `model` and
 *               `provider` are forwarded where the agent's command uses them.
 * @param exercisePath Exercise directory, relative to the current working directory.
 * @returns The argument vector, starting with "docker".
 * @throws Error when `config.agent` is not one of 'claude', 'goose', 'aider', 'codex'.
 */
function buildAgentCommand(config: BenchmarkConfig, exercisePath: string): string[] {
  const { agent, model, provider } = config;
  const dockerRun = ["docker", "run", "--rm", "-i"];
  const workspace = [
    "-v", `${join(process.cwd(), exercisePath)}:/workspace`,
    "-w", "/workspace",
  ];
  // The model name ends up inside an `sh -c` string, so quote it: a name with
  // spaces or shell metacharacters must stay a single argument.
  const quotedModel = shellQuote(model);

  switch (agent) {
    case 'claude':
      return [
        ...dockerRun,
        "-e", "ANTHROPIC_API_KEY",
        ...workspace,
        CLAUDE_CODE_CONTAINER,
        "sh", "-c",
        `claude --dangerously-skip-permissions --model ${quotedModel} -p .docs/instructions.md --system-prompt "${SYSTEM_PROMPT}"`
      ];
    case 'goose':
      return [
        ...dockerRun,
        // Passed as a plain argv entry (no shell involved), so no quoting here.
        "-e", `GOOSE_MODEL=${model}`,
        "-e", "OPENAI_API_KEY",
        "-e", "ANTHROPIC_API_KEY",
        "-e", "GOOGLE_API_KEY",
        "-e", `GOOSE_PROVIDER=${provider}`,
        "-e", "GOOSE_DISABLE_KEYRING=1",
        ...workspace,
        CLAUDE_CODE_CONTAINER,
        "sh", "-c",
        `goose run --with-builtin "developer" -i .docs/instructions.md --system "${SYSTEM_PROMPT}"`
      ];
    case 'aider':
      // NOTE(review): this command line never receives `model`, so --model has
      // no effect for aider as written — confirm whether that is intended.
      return [
        ...dockerRun,
        "-e", "OPENAI_API_KEY",
        "-e", "ANTHROPIC_API_KEY",
        "-e", "GOOGLE_API_KEY",
        ...workspace,
        CLAUDE_CODE_CONTAINER,
        "sh", "-c",
        `aider --yes-always --no-auto-commits --message "${SYSTEM_PROMPT} $(cat .docs/instructions.md)" --file *.ts --read *.test.ts`
      ];
    case 'codex':
      return [
        ...dockerRun,
        "-e", "OPENAI_API_KEY",
        ...workspace,
        CLAUDE_CODE_CONTAINER,
        "sh", "-c",
        `codex exec --full-auto --skip-git-repo-check -m ${quotedModel} "$(cat .docs/instructions.md)"`
      ];
    default:
      throw new Error(`Unknown agent: ${agent}`);
  }
}
/**
 * Phase 1: launches the configured agent in Docker for one exercise and
 * captures its output.
 *
 * The exercise's `.test.ts` files are additionally bind-mounted read-only on
 * top of the writable /workspace mount.
 *
 * @param config Benchmark settings (agent, model, provider, verbose).
 * @param exercise Exercise name, used for log lines and the result.
 * @param exercisePath Exercise directory, relative to the current working directory.
 * @returns A result whose `success` reflects a zero exit code. Never rejects:
 *          any thrown error is converted into a failed result.
 */
async function runAgentPhase(config: BenchmarkConfig, exercise: string, exercisePath: string): Promise<AgentResult> {
  const startTime = Date.now();
  try {
    const agentArgs = buildAgentCommand(config, exercisePath);
    // Add test files as read-only mounts. Docker options must precede the
    // image name, so they are inserted at the image's position.
    const testFiles = await getTestFiles(exercisePath);
    const imageIndex = agentArgs.indexOf(CLAUDE_CODE_CONTAINER);
    for (const testFile of testFiles) {
      agentArgs.splice(imageIndex, 0, "-v", `${join(process.cwd(), exercisePath, testFile)}:/workspace/${testFile}:ro`);
    }
    if (config.verbose) {
      console.log(`🤖 Agent command: ${agentArgs.join(" ")}`);
    }
    // Bun pipes stdout but lets the child inherit stderr by default, which
    // leaves proc.stderr undefined and the captured STDERR empty. Pipe both.
    const proc = spawn(agentArgs, { stdout: "pipe", stderr: "pipe" });
    // Drain both pipes while waiting for the exit so a chatty child can never
    // stall on a full pipe.
    const [stdout, stderr, exitCode] = await Promise.all([
      new Response(proc.stdout).text(),
      new Response(proc.stderr).text(),
      proc.exited,
    ]);
    const duration = Date.now() - startTime;
    if (exitCode === 0) {
      console.log(`🤖 ${exercise} - Agent Success (${duration}ms)`);
      return { exercise, success: true, duration, output: stdout };
    }
    console.log(`🤖 ${exercise} - Agent Failed (${duration}ms)`);
    if (config.verbose) {
      console.log(` Agent STDOUT: ${stdout.slice(0, 500)}...`);
      console.log(` Agent STDERR: ${stderr.slice(0, 500)}...`);
    }
    return { exercise, success: false, error: `STDOUT: ${stdout}\nSTDERR: ${stderr}`, duration, output: stdout };
  } catch (error) {
    const duration = Date.now() - startTime;
    const errorMsg = error instanceof Error ? error.message : String(error);
    console.log(`🤖 ${exercise} - Agent Error (${duration}ms): ${errorMsg}`);
    return { exercise, success: false, error: errorMsg, duration };
  }
}
/**
 * Phase 0: restores an exercise directory to the committed state with
 * `git checkout HEAD -- .` so every run starts from the same files.
 *
 * Best effort: failures are reported as console warnings and never thrown.
 * Note that this git command only restores tracked files; files that exist
 * only in the working tree are left in place.
 *
 * @param exercisePath Exercise directory, relative to the current working directory.
 * @param verbose When true, logs the start and the successful completion of the reset.
 */
async function resetExercise(exercisePath: string, verbose: boolean = false): Promise<void> {
  try {
    if (verbose) {
      console.log(`🔄 Resetting exercise: ${exercisePath}`);
    }
    const fullExercisePath = join(process.cwd(), exercisePath);
    const resetArgs = ["git", "-C", fullExercisePath, "checkout", "HEAD", "--", "."];
    // stderr is inherited by default in Bun; pipe it so the warning below can
    // actually include git's error message.
    const proc = spawn(resetArgs, { stderr: "pipe" });
    const [stderr, exitCode] = await Promise.all([
      new Response(proc.stderr).text(),
      proc.exited,
    ]);
    if (exitCode !== 0) {
      console.warn(`Warning: Failed to reset ${exercisePath}: ${stderr}`);
    } else if (verbose) {
      console.log(`✅ Successfully reset ${exercisePath}`);
    }
  } catch (error) {
    const errorMsg = error instanceof Error ? error.message : String(error);
    console.warn(`Warning: Git reset failed for ${exercisePath}: ${errorMsg}`);
  }
}
/**
 * Phase 2: runs the exercise's test command inside the benchmark container,
 * with the exercise directory bind-mounted at /workspace.
 *
 * @param config Benchmark settings; supplies the test command and the verbose flag.
 * @param exercise Exercise name, used for log lines and the result.
 * @param exercisePath Exercise directory, relative to the current working directory.
 * @returns A result whose `success` reflects a zero exit code. Never rejects:
 *          any thrown error is converted into a failed result.
 */
async function runTestPhase(config: BenchmarkConfig, exercise: string, exercisePath: string): Promise<AgentResult> {
  const startTime = Date.now();
  try {
    const testCommand = buildTestCommand(config);
    const testArgs = [
      "docker", "run", "--rm", "-i",
      "-v", `${join(process.cwd(), exercisePath)}:/workspace`,
      "-w", "/workspace",
      CLAUDE_CODE_CONTAINER,
      "sh", "-c", testCommand
    ];
    if (config.verbose) {
      console.log(`🧪 Test command: ${testArgs.join(" ")}`);
    }
    // Bun pipes stdout but lets the child inherit stderr by default, which
    // leaves proc.stderr undefined and the captured STDERR empty. Pipe both.
    const proc = spawn(testArgs, { stdout: "pipe", stderr: "pipe" });
    // Drain both pipes while waiting for the exit so a verbose test run can
    // never stall on a full pipe.
    const [stdout, stderr, exitCode] = await Promise.all([
      new Response(proc.stdout).text(),
      new Response(proc.stderr).text(),
      proc.exited,
    ]);
    const duration = Date.now() - startTime;
    if (exitCode === 0) {
      console.log(`🧪 ${exercise} - Test Success (${duration}ms)`);
      return { exercise, success: true, duration, output: stdout };
    }
    console.log(`🧪 ${exercise} - Test Failed (${duration}ms)`);
    if (config.verbose) {
      console.log(` Test STDOUT: ${stdout.slice(0, 500)}...`);
      console.log(` Test STDERR: ${stderr.slice(0, 500)}...`);
    }
    return { exercise, success: false, error: `STDOUT: ${stdout}\nSTDERR: ${stderr}`, duration, output: stdout };
  } catch (error) {
    const duration = Date.now() - startTime;
    const errorMsg = error instanceof Error ? error.message : String(error);
    console.log(`🧪 ${exercise} - Test Error (${duration}ms): ${errorMsg}`);
    return { exercise, success: false, error: errorMsg, duration };
  }
}
/**
 * Runs the whole pipeline for one exercise: reset the working tree, let the
 * agent attempt the solution, then run the tests.
 *
 * The test phase runs even when the agent phase failed; the exercise counts as
 * an overall success only when both phases succeeded.
 *
 * @param config Benchmark settings shared by all exercises.
 * @param exercise Exercise directory name under EXERCISM_PRACTICE_PATH.
 * @returns Per-phase outcomes and durations (milliseconds) for the exercise.
 */
async function runExercise(config: BenchmarkConfig, exercise: string): Promise<TestResult> {
  const startTime = Date.now();
  const exercisePath = join(EXERCISM_PRACTICE_PATH, exercise);
  console.log(`🧪 Starting ${exercise}... (Docker)`);

  // Phase 0: Reset exercise to clean state
  await resetExercise(exercisePath, config.verbose);
  // Phase 1: Run AI Agent
  const agentResult = await runAgentPhase(config, exercise, exercisePath);
  // Phase 2: Run Tests (always run, even if agent failed)
  const testResult = await runTestPhase(config, exercise, exercisePath);

  const totalDuration = Date.now() - startTime;
  const overallSuccess = agentResult.success && testResult.success;
  if (overallSuccess) {
    console.log(`✅ ${exercise} - Overall Success (${totalDuration}ms)`);
  } else {
    console.log(`❌ ${exercise} - Overall Failed (${totalDuration}ms)`);
    if (!agentResult.success) console.log(` 🤖 Agent failed: ${agentResult.error?.slice(0, 200)}...`);
    if (!testResult.success) console.log(` 🧪 Test failed: ${testResult.error?.slice(0, 200)}...`);
  }

  return {
    exercise,
    agentSuccess: agentResult.success,
    testSuccess: testResult.success,
    overallSuccess,
    agentError: agentResult.error,
    testError: testResult.error,
    agentDuration: agentResult.duration,
    testDuration: testResult.duration,
    totalDuration
  };
}
/**
 * Returns the command-line argument that follows `flag`, or undefined when the
 * flag is absent or is the last argument.
 */
function readFlagValue(flag: string): string | undefined {
  const flagIndex = process.argv.indexOf(flag);
  if (flagIndex === -1 || flagIndex + 1 >= process.argv.length) {
    return undefined;
  }
  return process.argv[flagIndex + 1];
}

/**
 * CLI entry point: parses the command line, selects exercises, runs them one
 * after another and prints a summary.
 *
 * Recognised options:
 *   --model <name>      model forwarded to the agent (default 'sonnet')
 *   --agent <name>      agent to run (default 'claude')
 *   --provider <name>   provider name (default 'openai')
 *   --exercise <value>  all digits: run the first <value> exercises;
 *                       otherwise an exercise name (for a path, the last segment is used)
 *   --list              print the available exercises and exit
 *   --verbose           log commands and truncated process output
 *
 * Without --exercise only the first exercise is run.
 */
async function runBenchmark(): Promise<void> {
  const model = readFlagValue('--model') ?? 'sonnet';
  const agent = readFlagValue('--agent') ?? 'claude';
  const provider = readFlagValue('--provider') ?? 'openai';
  const verbose = process.argv.includes('--verbose');

  // --exercise is overloaded: a purely numeric value is a count, anything else a name.
  let specificExercise: string | null = readFlagValue('--exercise') ?? null;
  let exerciseCount: number | null = null;
  if (specificExercise && /^\d+$/.test(specificExercise)) {
    exerciseCount = parseInt(specificExercise, 10);
    specificExercise = null;
  } else if (specificExercise && specificExercise.includes('/')) {
    specificExercise = specificExercise.split('/').pop() || null;
  }

  const listExercises = process.argv.includes('--list');
  const allExercises = await getPracticeExercises();
  if (listExercises) {
    console.log("📋 Available Exercism problems:");
    allExercises.forEach((exercise, index) => {
      console.log(` ${(index + 1).toString().padStart(3)}: ${exercise}`);
    });
    return;
  }

  console.log("🚀 Starting Exercism TypeScript benchmark");
  console.log(`📋 Solving TypeScript problems with ${agent} agent (Docker mode, ${model} model)\n`);

  let exercises: string[];
  if (specificExercise) {
    if (!allExercises.includes(specificExercise)) {
      console.error(`❌ Specified problem '${specificExercise}' not found`);
      console.log("Use --list option to see available problems");
      return;
    }
    exercises = [specificExercise];
    console.log(`🎯 Specified problem: ${specificExercise}\n`);
  } else if (exerciseCount) {
    // A count of 0 is falsy and therefore falls through to the default branch below.
    const count = Math.min(exerciseCount, allExercises.length);
    exercises = allExercises.slice(0, count);
    console.log(`🔢 Number of problems: ${count} (out of ${allExercises.length})\n`);
  } else {
    exercises = allExercises.slice(0, 1);
    console.log(`📊 Found problems: ${allExercises.length} (testing only the first one)\n`);
  }

  const results: TestResult[] = [];
  const config: BenchmarkConfig = {
    testCommand: 'yarn && yarn test',
    agent,
    model,
    provider,
    verbose
  };
  for (const exercise of exercises) {
    const result = await runExercise(config, exercise);
    results.push(result);
    // Short pause between exercises.
    await new Promise(resolve => setTimeout(resolve, 1000));
  }

  const successCount = results.filter(r => r.overallSuccess).length;
  const totalCount = results.length;
  // Guard the divisions: with no exercises to run (empty practice directory)
  // the summary would otherwise print NaN.
  const successRate = totalCount > 0 ? (successCount / totalCount) * 100 : 0;
  const avgDuration = totalCount > 0
    ? results.reduce((sum, r) => sum + r.totalDuration, 0) / totalCount
    : 0;
  const agentSuccessCount = results.filter(r => r.agentSuccess).length;
  const testSuccessCount = results.filter(r => r.testSuccess).length;

  console.log("\n" + "=".repeat(50));
  console.log("📈 Benchmark Results");
  console.log("=".repeat(50));
  console.log(`🎯 Success Rate: ${successRate.toFixed(1)}% (${successCount}/${totalCount})`);
  console.log(`⏱️ Average Duration: ${avgDuration.toFixed(0)}ms`);
  console.log(`✅ Overall Success: ${successCount}`);
  console.log(`🤖 Agent Success: ${agentSuccessCount}`);
  console.log(`🧪 Test Success: ${testSuccessCount}`);
  console.log(`❌ Failed: ${totalCount - successCount}`);

  console.log("\n📝 Detailed Results:");
  results.forEach(result => {
    const overallStatus = result.overallSuccess ? "✅" : "❌";
    const agentStatus = result.agentSuccess ? "🤖" : "❌";
    const testStatus = result.testSuccess ? "🧪" : "❌";
    const duration = `${result.totalDuration}ms`;
    console.log(` ${overallStatus} ${result.exercise.padEnd(25)} ${duration} (${agentStatus}${testStatus})`);
  });

  const failures = results.filter(r => !r.overallSuccess);
  if (failures.length > 0) {
    console.log("\n🔍 Errors for failed problems:");
    failures.forEach(result => {
      console.log(` ❌ ${result.exercise}:`);
      if (result.agentError) {
        console.log(` 🤖 Agent: ${result.agentError.slice(0, 500)}${result.agentError.length > 500 ? '...' : ''}`);
      }
      if (result.testError) {
        console.log(` 🧪 Test: ${result.testError.slice(0, 500)}${result.testError.length > 500 ? '...' : ''}`);
      }
    });
  }
}
// Run the benchmark only when this file is executed directly, not when it is imported.
if (import.meta.main) {
  runBenchmark().catch((error) => {
    console.error(error);
    // Surface the crash through the exit status; logging alone left it at 0.
    process.exitCode = 1;
  });
}
# Image used by the benchmark script for both the agent runs and the test runs.
# It bundles Node 22, yarn (via corepack) and the four agent CLIs.
FROM node:22
ARG TZ
ENV TZ="$TZ"
# Install basic development tools and iptables/ipset.
# apt-get is the script-stable front end; the package lists are removed in the
# same layer so they do not end up in the image.
RUN apt-get update && apt-get install -y less \
    git \
    procps \
    sudo \
    fzf \
    zsh \
    man-db \
    unzip \
    gnupg2 \
    gh \
    iptables \
    ipset \
    iproute2 \
    dnsutils \
    aggregate \
    ripgrep \
    jq \
    && rm -rf /var/lib/apt/lists/*
# Ensure default node user has access to /usr/local/share
RUN mkdir -p /usr/local/share/npm-global && \
    chown -R node:node /usr/local/share
ARG USERNAME=node
WORKDIR /workspace
RUN mkdir -p /workspace && \
    chown -R node:node /workspace
# Enable corepack for yarn version management (as root)
ENV COREPACK_ENABLE_DOWNLOAD_PROMPT=0
RUN corepack enable && corepack prepare yarn@stable --activate
# Set up yarn and corepack environment
ENV YARN_CACHE_FOLDER=/home/node/.yarn/cache
ENV YARN_GLOBAL_FOLDER=/home/node/.yarn/global
RUN mkdir -p /home/node/.yarn/cache /home/node/.yarn/global /home/node/.cache/node/corepack && \
    chown -R node:node /home/node/.yarn /home/node/.cache
# Install global packages
ENV NPM_CONFIG_PREFIX=/usr/local/share/npm-global
ENV PATH=$PATH:/usr/local/share/npm-global/bin
# Set up non-root user
USER node
# Install Claude Code
RUN npm install -g @anthropic-ai/claude-code
RUN mkdir -p /home/node/.claude
# Install Codex CLI(Native)
RUN npm install -g @openai/codex@native
ENV CODEX_RUST=1
# Write the instructions into the Codex config directory. A relative path would
# land in /workspace (the WORKDIR), which is hidden as soon as an exercise
# directory is bind-mounted there.
RUN mkdir -p /home/node/.codex && \
    echo "Solve this TypeScript exercise. Read the test file to understand requirements and implement the solution." > /home/node/.codex/AGENTS.md
# Install Goose CLI
RUN curl -fsSL https://github.com/block/goose/releases/download/stable/download_cli.sh | bash
ENV HOME=/home/node
ENV PATH=$HOME/.local/bin:$PATH
# Install Aider
RUN curl -LsSf https://aider.chat/install.sh | sh
# Aider settings: no git integration, no auto-commits, no release notes,
# no repository sanity check, and empty history-file settings.
ENV AIDER_GIT=false
ENV AIDER_AUTO_COMMITS=false
ENV AIDER_SHOW_RELEASE_NOTES=false
ENV AIDER_SKIP_SANITY_CHECK_REPO=true
ENV AIDER_CHAT_HISTORY_FILE=""
ENV AIDER_INPUT_HISTORY_FILE=""
❯ bun run src/index.ts --agent aider --model gpt-4.1-mini --exercise 100 --verbose

==================================================
📈 Benchmark Results
==================================================
🎯 Success Rate: 76.0% (76/100)
⏱  Average Duration: 169.3s
✅ Overall Success: 76
🤖 Agent Success: 99
🧪 Test Success: 76
❌ Failed: 24

📝 Detailed Results:
  ✅ accumulate                41.8s (🤖🧪)
  ❌ acronym                   51.6s (🤖❌)
  ✅ all-your-base             96.3s (🤖🧪)
  ✅ allergies                 52.2s (🤖🧪)
  ✅ alphametics               83.3s (🤖🧪)
  ❌ anagram                   42.3s (🤖❌)
  ✅ armstrong-numbers         74.2s (🤖🧪)
  ✅ atbash-cipher             52.4s (🤖🧪)
  ❌ bank-account              83.8s (🤖❌)
  ✅ beer-song                 45.2s (🤖🧪)
  ❌ binary-search             39.9s (🤖❌)
  ❌ binary-search-tree        44.6s (🤖❌)
  ✅ bob                       48.7s (🤖🧪)
  ❌ bowling                   1030.1s (🤖❌)
  ✅ circular-buffer           54.2s (🤖🧪)
  ✅ clock                     2892.3s (🤖🧪)
  ✅ collatz-conjecture        960.5s (🤖🧪)
  ❌ complex-numbers           55.9s (🤖❌)
  ❌ connect                   2113.2s (🤖❌)
  ❌ crypto-square             42.9s (🤖❌)
  ✅ custom-set                1012.6s (🤖🧪)
  ✅ darts                     167.3s (🤖🧪)
  ❌ diamond                   104.3s (🤖❌)
  ✅ difference-of-squares     948.9s (🤖🧪)
  ✅ diffie-hellman            56.2s (🤖🧪)
  ❌ dnd-character             50.2s (🤖❌)
  ✅ eliuds-eggs               41.8s (🤖🧪)
  ✅ etl                       42.9s (🤖🧪)
  ❌ flatten-array             37.7s (🤖❌)
  ❌ food-chain                83.5s (🤖❌)
  ✅ game-of-life              43.2s (🤖🧪)
  ✅ gigasecond                44.3s (🤖🧪)
  ✅ grade-school              50.4s (🤖🧪)
  ✅ grains                    44.3s (🤖🧪)
  ✅ hamming                   42.0s (🤖🧪)
  ✅ hello-world               68.1s (🤖🧪)
  ❌ house                     62.8s (🤖❌)
  ✅ isbn-verifier             93.1s (🤖🧪)
  ✅ isogram                   81.2s (🤖🧪)
  ✅ kindergarten-garden       47.6s (🤖🧪)
  ✅ knapsack                  48.1s (🤖🧪)
  ✅ largest-series-product    48.2s (🤖🧪)
  ✅ leap                      40.3s (🤖🧪)
  ✅ linked-list               65.1s (🤖🧪)
  ✅ list-ops                  64.2s (🤖🧪)
  ✅ luhn                      59.9s (🤖🧪)
  ✅ matching-brackets         45.3s (🤖🧪)
  ✅ matrix                    45.8s (🤖🧪)
  ✅ minesweeper               49.1s (🤖🧪)
  ✅ nth-prime                 42.6s (🤖🧪)
  ✅ nucleotide-count          42.0s (🤖🧪)
  ✅ ocr-numbers               52.8s (🤖🧪)
  ✅ palindrome-products       81.1s (🤖🧪)
  ✅ pangram                   44.3s (🤖🧪)
  ❌ pascals-triangle          141.9s (❌❌)
  ✅ perfect-numbers           82.5s (🤖🧪)
  ✅ phone-number              69.4s (🤖🧪)
  ✅ pig-latin                 53.8s (🤖🧪)
  ✅ prime-factors             44.2s (🤖🧪)
  ✅ protein-translation       92.3s (🤖🧪)
  ✅ proverb                   82.4s (🤖🧪)
  ✅ pythagorean-triplet       54.3s (🤖🧪)
  ✅ queen-attack              72.4s (🤖🧪)
  ✅ raindrops                 78.5s (🤖🧪)
  ❌ rational-numbers          58.6s (🤖❌)
  ❌ react                     122.0s (🤖❌)
  ❌ rectangles                53.9s (🤖❌)
  ❌ relative-distance         74.6s (🤖❌)
  ✅ resistor-color            76.6s (🤖🧪)
  ✅ resistor-color-duo        42.9s (🤖🧪)
  ✅ resistor-color-trio       75.6s (🤖🧪)
  ✅ reverse-string            40.9s (🤖🧪)
  ✅ rna-transcription         42.4s (🤖🧪)
  ❌ robot-name                47.5s (🤖❌)
  ✅ robot-simulator           53.7s (🤖🧪)
  ✅ roman-numerals            52.5s (🤖🧪)
  ✅ rotational-cipher         78.9s (🤖🧪)
  ✅ run-length-encoding       75.3s (🤖🧪)
  ✅ saddle-points             56.9s (🤖🧪)
  ✅ say                       53.2s (🤖🧪)
  ✅ scrabble-score            54.8s (🤖🧪)
  ✅ secret-handshake          345.6s (🤖🧪)
  ✅ series                    974.5s (🤖🧪)
  ✅ sieve                     949.0s (🤖🧪)
  ✅ simple-cipher             90.7s (🤖🧪)
  ✅ space-age                 114.8s (🤖🧪)
  ❌ spiral-matrix             60.0s (🤖❌)
  ✅ square-root               76.8s (🤖🧪)
  ✅ strain                    80.3s (🤖🧪)
  ✅ sublist                   50.1s (🤖🧪)
  ✅ sum-of-multiples          42.5s (🤖🧪)
  ✅ tournament                58.5s (🤖🧪)
  ❌ transpose                 80.7s (🤖❌)
  ✅ triangle                  78.9s (🤖🧪)
  ✅ twelve-days               49.6s (🤖🧪)
  ❌ two-bucket                79.2s (🤖❌)
  ✅ two-fer                   42.9s (🤖🧪)
  ❌ variable-length-quantity  61.9s (🤖❌)
  ✅ word-count                47.0s (🤖🧪)
  ✅ word-search               56.7s (🤖🧪)
>❯ bun run cli-agents-benchmark.ts --verbose  --agent goose --provider google --model gemini-2.5-flash --exercise 100

==================================================
📈 Benchmark Results
==================================================
🎯 Success Rate: 96.0% (96/100)
⏱  Average Duration: 102.2s
✅ Overall Success: 96
🤖 Agent Success: 100
🧪 Test Success: 96
❌ Failed: 4

📝 Detailed Results:
  ✅ accumulate                56.1s (🤖🧪)
  ✅ acronym                   97.3s (🤖🧪)
  ✅ all-your-base             76.8s (🤖🧪)
  ✅ allergies                 64.5s (🤖🧪)
  ✅ alphametics               88.2s (🤖🧪)
  ❌ anagram                   62.2s (🤖❌)
  ✅ armstrong-numbers         60.8s (🤖🧪)
  ✅ atbash-cipher             70.6s (🤖🧪)
  ✅ bank-account              61.3s (🤖🧪)
  ✅ beer-song                 88.1s (🤖🧪)
  ✅ binary-search             64.7s (🤖🧪)
  ✅ binary-search-tree        73.1s (🤖🧪)
  ✅ bob                       106.4s (🤖🧪)
  ❌ bowling                   169.6s (🤖❌)
  ✅ circular-buffer           69.3s (🤖🧪)
  ✅ clock                     79.3s (🤖🧪)
  ✅ collatz-conjecture        77.1s (🤖🧪)
  ✅ complex-numbers           177.7s (🤖🧪)
  ✅ connect                   491.3s (🤖🧪)
  ❌ crypto-square             84.4s (🤖❌)
  ✅ custom-set                109.4s (🤖🧪)
  ✅ darts                     70.1s (🤖🧪)
  ✅ diamond                   138.3s (🤖🧪)
  ✅ difference-of-squares     92.7s (🤖🧪)
  ✅ diffie-hellman            102.0s (🤖🧪)
  ✅ dnd-character             86.3s (🤖🧪)
  ✅ eliuds-eggs               69.5s (🤖🧪)
  ✅ etl                       70.4s (🤖🧪)
  ✅ flatten-array             68.9s (🤖🧪)
  ✅ food-chain                280.0s (🤖🧪)
  ✅ game-of-life              66.6s (🤖🧪)
  ✅ gigasecond                84.1s (🤖🧪)
  ✅ grade-school              85.9s (🤖🧪)
  ✅ grains                    77.1s (🤖🧪)
  ✅ hamming                   72.3s (🤖🧪)
  ✅ hello-world               67.5s (🤖🧪)
  ✅ house                     91.1s (🤖🧪)
  ✅ isbn-verifier             71.7s (🤖🧪)
  ✅ isogram                   68.6s (🤖🧪)
  ✅ kindergarten-garden       107.8s (🤖🧪)
  ✅ knapsack                  70.8s (🤖🧪)
  ✅ largest-series-product    72.7s (🤖🧪)
  ✅ leap                      70.0s (🤖🧪)
  ✅ linked-list               71.7s (🤖🧪)
  ✅ list-ops                  104.4s (🤖🧪)
  ✅ luhn                      74.0s (🤖🧪)
  ✅ matching-brackets         96.8s (🤖🧪)
  ✅ matrix                    96.1s (🤖🧪)
  ✅ minesweeper               81.0s (🤖🧪)
  ✅ nth-prime                 74.1s (🤖🧪)
  ✅ nucleotide-count          70.3s (🤖🧪)
  ✅ ocr-numbers               582.5s (🤖🧪)
  ✅ palindrome-products       161.3s (🤖🧪)
  ✅ pangram                   73.4s (🤖🧪)
  ✅ pascals-triangle          77.0s (🤖🧪)
  ✅ perfect-numbers           78.2s (🤖🧪)
  ✅ phone-number              87.4s (🤖🧪)
  ✅ pig-latin                 121.3s (🤖🧪)
  ✅ prime-factors             75.2s (🤖🧪)
  ✅ protein-translation       70.7s (🤖🧪)
  ✅ proverb                   154.3s (🤖🧪)
  ✅ pythagorean-triplet       264.7s (🤖🧪)
  ✅ queen-attack              92.9s (🤖🧪)
  ✅ raindrops                 71.5s (🤖🧪)
  ✅ rational-numbers          95.0s (🤖🧪)
  ✅ react                     297.5s (🤖🧪)
  ✅ rectangles                96.5s (🤖🧪)
  ✅ relative-distance         86.3s (🤖🧪)
  ✅ resistor-color            79.2s (🤖🧪)
  ✅ resistor-color-duo        72.4s (🤖🧪)
  ✅ resistor-color-trio       75.7s (🤖🧪)
  ✅ reverse-string            71.3s (🤖🧪)
  ✅ rna-transcription         72.0s (🤖🧪)
  ✅ robot-name                99.9s (🤖🧪)
  ❌ robot-simulator           92.0s (🤖❌)
  ✅ roman-numerals            79.1s (🤖🧪)
  ✅ rotational-cipher         73.1s (🤖🧪)
  ✅ run-length-encoding       73.5s (🤖🧪)
  ✅ saddle-points             94.1s (🤖🧪)
  ✅ say                       90.3s (🤖🧪)
  ✅ scrabble-score            68.9s (🤖🧪)
  ✅ secret-handshake          73.8s (🤖🧪)
  ✅ series                    96.0s (🤖🧪)
  ✅ sieve                     76.5s (🤖🧪)
  ✅ simple-cipher             104.8s (🤖🧪)
  ✅ space-age                 73.5s (🤖🧪)
  ✅ spiral-matrix             77.5s (🤖🧪)
  ✅ square-root               65.9s (🤖🧪)
  ✅ strain                    71.8s (🤖🧪)
  ✅ sublist                   85.6s (🤖🧪)
  ✅ sum-of-multiples          72.6s (🤖🧪)
  ✅ tournament                157.6s (🤖🧪)
  ✅ transpose                 192.5s (🤖🧪)
  ✅ triangle                  74.9s (🤖🧪)
  ✅ twelve-days               99.6s (🤖🧪)
  ✅ two-bucket                101.3s (🤖🧪)
  ✅ two-fer                   74.3s (🤖🧪)
  ✅ variable-length-quantity  124.7s (🤖🧪)
  ✅ word-count                75.7s (🤖🧪)
  ✅ word-search               115.7s (🤖🧪)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment