laiso · June 22, 2025 02:15
diff --git a/cli-agents-benchmark.ts b/cli-agents-benchmark.ts
 #!/usr/bin/env bun

 import { spawn } from "bun";
 import { join } from "path";
 import { readdir } from "fs/promises";

 // Docker image name used for every `docker run` issued by this script (agent runs and test runs).
 const CLAUDE_CODE_CONTAINER = "cli-agents-benchmark";

 // Directory, relative to the current working directory, that holds one sub-directory per exercise.
 const EXERCISM_PRACTICE_PATH = "exercism/typescript/exercises/practice";
 // Instruction text handed to the agents. The single quotes are part of the value; the command
 // builders interpolate it inside double quotes in the `sh -c` command strings.
 const SYSTEM_PROMPT = "'Solve this TypeScript exercise. Read the test file to understand requirements and implement the solution.'";

 /** Outcome of a single phase (agent run or test run) for one exercise. */
 interface AgentResult {
    /** Name of the exercise the phase was run for. */
    exercise: string;
    /** True when the spawned process exited with code 0. */
    success: boolean;
    /** Failure detail: the captured STDOUT/STDERR text, or the message of a thrown error. */
    error?: string;
    /** Wall-clock time of the phase in milliseconds. */
    duration: number;
    /** Captured standard output; absent when the phase failed with a thrown error. */
    output?: string;
 }

 /** Combined outcome of the agent phase and the test phase for one exercise. */
 interface TestResult {
    /** Name of the exercise. */
    exercise: string;
    /** Whether the agent process succeeded. */
    agentSuccess: boolean;
    /** Whether the test process succeeded. */
    testSuccess: boolean;
    /** True only when both the agent phase and the test phase succeeded. */
    overallSuccess: boolean;
    /** Error text from the agent phase, if it failed. */
    agentError?: string;
    /** Error text from the test phase, if it failed. */
    testError?: string;
    /** Duration of the agent phase in milliseconds. */
    agentDuration: number;
    /** Duration of the test phase in milliseconds. */
    testDuration: number;
    /** Duration of the whole exercise (reset + agent + tests) in milliseconds. */
    totalDuration: number;
 }

 /** Settings shared by every exercise in one benchmark run. */
 interface BenchmarkConfig {
    /** Shell command executed via `sh -c` inside the container to run the exercise's tests. */
    testCommand: string;
    /** Agent to run; the command builder recognises 'claude', 'goose', 'aider' and 'codex'. */
    agent: string;
    /** Model name passed to the agent. */
    model: string;
    /** Provider name; only the goose command uses it (as GOOSE_PROVIDER). */
    provider: string;
    /** When true, the spawned commands and truncated process output are logged. */
    verbose: boolean;
 }

 /**
  * Lists the exercises available under EXERCISM_PRACTICE_PATH.
  *
  * @returns The names of all non-hidden sub-directories, sorted.
  */
 async function getPracticeExercises(): Promise<string[]> {
    const dirents = await readdir(join(process.cwd(), EXERCISM_PRACTICE_PATH), { withFileTypes: true });

    const names: string[] = [];
    for (const dirent of dirents) {
        // Keep real directories only and skip dot-directories.
        if (dirent.isDirectory() && !dirent.name.startsWith('.')) {
            names.push(dirent.name);
        }
    }

    names.sort();
    return names;
 }


 /**
  * Finds the test files of an exercise.
  *
  * @param exercisePath - Exercise directory, relative to the current working directory.
  * @returns File names ending in `.test.ts`; an empty list (after a warning) when the
  *          directory cannot be read.
  */
 async function getTestFiles(exercisePath: string): Promise<string[]> {
    let names: string[];
    try {
        names = await readdir(join(process.cwd(), exercisePath));
    } catch {
        console.warn(`Warning: Could not read test files from ${exercisePath}`);
        return [];
    }

    const testFiles: string[] = [];
    for (const name of names) {
        if (name.endsWith('.test.ts')) {
            testFiles.push(name);
        }
    }
    return testFiles;
 }

 /**
  * Returns the shell command used to run an exercise's tests.
  *
  * @param config - Benchmark settings; only `testCommand` is read.
  */
 function buildTestCommand(config: BenchmarkConfig): string {
    const { testCommand } = config;
    return testCommand;
 }

 /**
  * Builds the `docker run` argument vector that starts the selected agent on an exercise.
  *
  * @param config - Benchmark settings; `agent` selects the command, `model` and `provider`
  *                 are passed through to the agent.
  * @param exercisePath - Exercise directory, relative to the current working directory;
  *                       it is bind-mounted at /workspace.
  * @returns A fresh argument array (callers may mutate it).
  * @throws Error when `config.agent` is not one of claude, goose, aider or codex.
  */
 function buildAgentCommand(config: BenchmarkConfig, exercisePath: string): string[] {
    const { agent, model, provider } = config;
    const dockerRun = ["docker", "run", "--rm", "-i"];
    // Tail shared by every agent: workspace mount, working directory, image, then `sh -c`.
    const workspaceAndShell = [
        "-v", `${join(process.cwd(), exercisePath)}:/workspace`,
        "-w", "/workspace",
        CLAUDE_CODE_CONTAINER,
        "sh", "-c",
    ];

    switch (agent) {
        case 'claude':
            return [
                ...dockerRun,
                "-e", "ANTHROPIC_API_KEY",
                ...workspaceAndShell,
                `claude --dangerously-skip-permissions --model ${model} -p .docs/instructions.md --system-prompt "${SYSTEM_PROMPT}"`,
            ];
        case 'goose':
            return [
                ...dockerRun,
                "-e", `GOOSE_MODEL=${model}`,
                "-e", "OPENAI_API_KEY",
                "-e", "ANTHROPIC_API_KEY",
                "-e", "GOOGLE_API_KEY",
                "-e", `GOOSE_PROVIDER=${provider}`,
                "-e", "GOOSE_DISABLE_KEYRING=1",
                ...workspaceAndShell,
                `goose run --with-builtin "developer" -i .docs/instructions.md --system "${SYSTEM_PROMPT}"`,
            ];
        case 'aider':
            return [
                ...dockerRun,
                "-e", "OPENAI_API_KEY",
                "-e", "ANTHROPIC_API_KEY",
                "-e", "GOOGLE_API_KEY",
                ...workspaceAndShell,
                `aider --yes-always --no-auto-commits --message "${SYSTEM_PROMPT} $(cat .docs/instructions.md)" --file *.ts --read *.test.ts`,
            ];
        case 'codex':
            return [
                ...dockerRun,
                "-e", "OPENAI_API_KEY",
                ...workspaceAndShell,
                `codex exec --full-auto --skip-git-repo-check -m ${model} "$(cat .docs/instructions.md)"`,
            ];
        default:
            throw new Error(`Unknown agent: ${agent}`);
    }
 }

 /**
  * Runs the configured agent against one exercise inside a Docker container.
  *
  * The exercise's `*.test.ts` files are additionally bind-mounted read-only on top of the
  * workspace mount.
  *
  * @param config - Benchmark settings (agent, model, provider, verbosity).
  * @param exercise - Exercise name, used for log output and in the result.
  * @param exercisePath - Exercise directory, relative to the current working directory.
  * @returns The phase outcome. A non-zero exit code or a thrown error is reported through
  *          `success: false` and `error`; nothing is rethrown.
  */
 async function runAgentPhase(config: BenchmarkConfig, exercise: string, exercisePath: string): Promise<AgentResult> {
    const startTime = Date.now();

    try {
        const agentArgs = buildAgentCommand(config, exercisePath);

        // Add test files as read-only mounts, inserted just before the image name.
        const testFiles = await getTestFiles(exercisePath);
        const imageIndex = agentArgs.indexOf(CLAUDE_CODE_CONTAINER);
        for (const testFile of testFiles) {
            agentArgs.splice(imageIndex, 0, "-v", `${join(process.cwd(), exercisePath, testFile)}:/workspace/${testFile}:ro`);
        }

        if (config.verbose) {
            console.log(`🤖 Agent command: ${agentArgs.join(" ")}`);
        }

        // Bun.spawn inherits stderr unless told otherwise, which leaves `proc.stderr` without a
        // stream and the captured STDERR empty. Request both pipes explicitly.
        const proc = spawn(agentArgs, { stdout: "pipe", stderr: "pipe" });

        // Drain both pipes while waiting for the exit so a chatty child cannot stall on a full pipe.
        const [stdout, stderr, exitCode] = await Promise.all([
            new Response(proc.stdout).text(),
            new Response(proc.stderr).text(),
            proc.exited,
        ]);
        const duration = Date.now() - startTime;

        if (exitCode === 0) {
            console.log(`🤖 ${exercise} - Agent Success (${duration}ms)`);
            return { exercise, success: true, duration, output: stdout };
        }

        console.log(`🤖 ${exercise} - Agent Failed (${duration}ms)`);
        if (config.verbose) {
            console.log(`  Agent STDOUT: ${stdout.slice(0, 500)}...`);
            console.log(`  Agent STDERR: ${stderr.slice(0, 500)}...`);
        }
        return { exercise, success: false, error: `STDOUT: ${stdout}\nSTDERR: ${stderr}`, duration, output: stdout };
    } catch (error) {
        const duration = Date.now() - startTime;
        const errorMsg = error instanceof Error ? error.message : String(error);
        console.log(`🤖 ${exercise} - Agent Error (${duration}ms): ${errorMsg}`);
        return { exercise, success: false, error: errorMsg, duration };
    }
 }

 /**
  * Restores an exercise directory to its committed state with `git checkout HEAD -- .`.
  *
  * Best effort: a failure is logged as a warning and never thrown.
  * NOTE(review): this checkout presumably leaves untracked files from an earlier run in
  * place; confirm whether a `git clean` is also wanted.
  *
  * @param exercisePath - Exercise directory, relative to the current working directory.
  * @param verbose - When true, progress messages are logged.
  */
 async function resetExercise(exercisePath: string, verbose: boolean = false): Promise<void> {
    try {
        if (verbose) {
            console.log(`🔄 Resetting exercise: ${exercisePath}`);
        }

        const fullExercisePath = join(process.cwd(), exercisePath);
        const resetArgs = ["git", "-C", fullExercisePath, "checkout", "HEAD", "--", "."];

        // stderr must be piped explicitly: Bun.spawn inherits it by default, which would leave
        // the warning below without git's message. stdout is never read, so it is discarded.
        const proc = spawn(resetArgs, { stdout: "ignore", stderr: "pipe" });
        const [stderr, exitCode] = await Promise.all([
            new Response(proc.stderr).text(),
            proc.exited,
        ]);

        if (exitCode !== 0) {
            console.warn(`Warning: Failed to reset ${exercisePath}: ${stderr}`);
        } else if (verbose) {
            console.log(`✅ Successfully reset ${exercisePath}`);
        }
    } catch (error) {
        const errorMsg = error instanceof Error ? error.message : String(error);
        console.warn(`Warning: Git reset failed for ${exercisePath}: ${errorMsg}`);
    }
 }

 /**
  * Runs the exercise's test command inside a Docker container.
  *
  * @param config - Benchmark settings; supplies the test command and verbosity.
  * @param exercise - Exercise name, used for log output and in the result.
  * @param exercisePath - Exercise directory, relative to the current working directory;
  *                       it is bind-mounted at /workspace.
  * @returns The phase outcome. A non-zero exit code or a thrown error is reported through
  *          `success: false` and `error`; nothing is rethrown.
  */
 async function runTestPhase(config: BenchmarkConfig, exercise: string, exercisePath: string): Promise<AgentResult> {
    const startTime = Date.now();

    try {
        const testCommand = buildTestCommand(config);
        const testArgs = [
            "docker", "run", "--rm", "-i",
            "-v", `${join(process.cwd(), exercisePath)}:/workspace`,
            "-w", "/workspace",
            CLAUDE_CODE_CONTAINER,
            "sh", "-c", testCommand
        ];

        if (config.verbose) {
            console.log(`🧪 Test command: ${testArgs.join(" ")}`);
        }

        // Bun.spawn inherits stderr unless told otherwise, which leaves `proc.stderr` without a
        // stream and the captured STDERR empty. Request both pipes explicitly.
        const proc = spawn(testArgs, { stdout: "pipe", stderr: "pipe" });

        // Drain both pipes while waiting for the exit so a chatty child cannot stall on a full pipe.
        const [stdout, stderr, exitCode] = await Promise.all([
            new Response(proc.stdout).text(),
            new Response(proc.stderr).text(),
            proc.exited,
        ]);
        const duration = Date.now() - startTime;

        if (exitCode === 0) {
            console.log(`🧪 ${exercise} - Test Success (${duration}ms)`);
            return { exercise, success: true, duration, output: stdout };
        }

        console.log(`🧪 ${exercise} - Test Failed (${duration}ms)`);
        if (config.verbose) {
            console.log(`  Test STDOUT: ${stdout.slice(0, 500)}...`);
            console.log(`  Test STDERR: ${stderr.slice(0, 500)}...`);
        }
        return { exercise, success: false, error: `STDOUT: ${stdout}\nSTDERR: ${stderr}`, duration, output: stdout };
    } catch (error) {
        const duration = Date.now() - startTime;
        const errorMsg = error instanceof Error ? error.message : String(error);
        console.log(`🧪 ${exercise} - Test Error (${duration}ms): ${errorMsg}`);
        return { exercise, success: false, error: errorMsg, duration };
    }
 }

 /**
  * Runs the full pipeline for one exercise: reset, agent phase, then test phase.
  *
  * The test phase always runs, even when the agent phase failed.
  *
  * @param config - Benchmark settings.
  * @param exercise - Exercise directory name under EXERCISM_PRACTICE_PATH.
  * @returns Combined success flags, errors and durations of both phases.
  */
 async function runExercise(config: BenchmarkConfig, exercise: string): Promise<TestResult> {
    const startedAt = Date.now();
    const exercisePath = join(EXERCISM_PRACTICE_PATH, exercise);

    console.log(`🧪 Starting ${exercise}... (Docker)`);

    // Reset first, then let the agent work, then verify with the tests.
    await resetExercise(exercisePath, config.verbose);
    const agentOutcome = await runAgentPhase(config, exercise, exercisePath);
    const testOutcome = await runTestPhase(config, exercise, exercisePath);

    const totalDuration = Date.now() - startedAt;
    const overallSuccess = agentOutcome.success && testOutcome.success;

    if (!overallSuccess) {
        console.log(`❌ ${exercise} - Overall Failed (${totalDuration}ms)`);
        if (!agentOutcome.success) {
            console.log(`  🤖 Agent failed: ${agentOutcome.error?.slice(0, 200)}...`);
        }
        if (!testOutcome.success) {
            console.log(`  🧪 Test failed: ${testOutcome.error?.slice(0, 200)}...`);
        }
    } else {
        console.log(`✅ ${exercise} - Overall Success (${totalDuration}ms)`);
    }

    const summary: TestResult = {
        exercise,
        agentSuccess: agentOutcome.success,
        testSuccess: testOutcome.success,
        overallSuccess,
        agentError: agentOutcome.error,
        testError: testOutcome.error,
        agentDuration: agentOutcome.duration,
        testDuration: testOutcome.duration,
        totalDuration,
    };
    return summary;
 }

 /**
  * Command-line entry point: parses the flags, selects the exercises, runs them one after
  * another and prints a summary.
  *
  * Flags read from `process.argv`:
  *   --model <name>      model passed to the agent (default 'sonnet')
  *   --agent <name>      agent to run (default 'claude')
  *   --provider <name>   provider, used by goose (default 'openai')
  *   --exercise <value>  an exercise name, a path whose last segment is the name, or a
  *                       number N meaning "the first N exercises"
  *   --list              print the available exercises and exit
  *   --verbose           log commands and truncated process output
  *
  * Without --exercise only the first exercise is run.
  */
 async function runBenchmark(): Promise<void> {
    const argv = process.argv;

    // Returns the token following `flag`, or undefined when the flag is absent or last.
    const optionValue = (flag: string): string | undefined => {
        const index = argv.indexOf(flag);
        return index !== -1 && index + 1 < argv.length ? argv[index + 1] : undefined;
    };

    const model = optionValue('--model') ?? 'sonnet';
    const agent = optionValue('--agent') ?? 'claude';
    const provider = optionValue('--provider') ?? 'openai';
    const verbose = argv.includes('--verbose');
    const listExercises = argv.includes('--list');

    let specificExercise: string | null = optionValue('--exercise') ?? null;
    let exerciseCount: number | null = null;

    if (specificExercise && /^\d+$/.test(specificExercise)) {
        // A purely numeric value is a count, not a name.
        exerciseCount = parseInt(specificExercise, 10);
        specificExercise = null;
    } else if (specificExercise && specificExercise.includes('/')) {
        // Accept a path and keep only its last segment; a trailing slash yields no name.
        specificExercise = specificExercise.split('/').pop() || null;
    }

    const allExercises = await getPracticeExercises();

    if (listExercises) {
        console.log("📋 Available Exercism problems:");
        allExercises.forEach((exercise, index) => {
            console.log(`  ${(index + 1).toString().padStart(3)}: ${exercise}`);
        });
        return;
    }

    console.log("🚀 Starting Exercism TypeScript benchmark");
    console.log(`📋 Solving TypeScript problems with ${agent} agent (Docker mode, ${model} model)\n`);

    let exercises: string[];
    if (specificExercise) {
        if (!allExercises.includes(specificExercise)) {
            console.error(`❌ Specified problem '${specificExercise}' not found`);
            console.log("Use --list option to see available problems");
            return;
        }
        exercises = [specificExercise];
        console.log(`🎯 Specified problem: ${specificExercise}\n`);
    } else if (exerciseCount !== null) {
        // Compared against null so that an explicit count of 0 is honoured rather than
        // silently falling through to the "first exercise only" default.
        const count = Math.min(exerciseCount, allExercises.length);
        exercises = allExercises.slice(0, count);
        console.log(`🔢 Number of problems: ${count} (out of ${allExercises.length})\n`);
    } else {
        exercises = allExercises.slice(0, 1);
        console.log(`📊 Found problems: ${allExercises.length} (testing only the first one)\n`);
    }

    // Nothing selected (no exercises on disk, or a count of 0): stop before the summary,
    // whose averages would otherwise be 0/0 and print as NaN.
    if (exercises.length === 0) {
        console.log("⚠️ No problems to run");
        return;
    }

    const config: BenchmarkConfig = {
        testCommand: 'yarn && yarn test',
        agent,
        model,
        provider,
        verbose
    };

    const results: TestResult[] = [];
    for (const exercise of exercises) {
        results.push(await runExercise(config, exercise));
        // Short pause between exercises.
        await new Promise(resolve => setTimeout(resolve, 1000));
    }

    const totalCount = results.length;
    const failures = results.filter(r => !r.overallSuccess);
    const successCount = totalCount - failures.length;
    const successRate = (successCount / totalCount) * 100;
    const avgDuration = results.reduce((sum, r) => sum + r.totalDuration, 0) / totalCount;
    const agentSuccessCount = results.filter(r => r.agentSuccess).length;
    const testSuccessCount = results.filter(r => r.testSuccess).length;

    console.log("\n" + "=".repeat(50));
    console.log("📈 Benchmark Results");
    console.log("=".repeat(50));
    console.log(`🎯 Success Rate: ${successRate.toFixed(1)}% (${successCount}/${totalCount})`);
    console.log(`⏱️  Average Duration: ${avgDuration.toFixed(0)}ms`);
    console.log(`✅ Overall Success: ${successCount}`);
    console.log(`🤖 Agent Success: ${agentSuccessCount}`);
    console.log(`🧪 Test Success: ${testSuccessCount}`);
    console.log(`❌ Failed: ${totalCount - successCount}`);

    console.log("\n📝 Detailed Results:");
    for (const result of results) {
        const overallStatus = result.overallSuccess ? "✅" : "❌";
        const agentStatus = result.agentSuccess ? "🤖" : "❌";
        const testStatus = result.testSuccess ? "🧪" : "❌";
        const duration = `${result.totalDuration}ms`;
        console.log(`  ${overallStatus} ${result.exercise.padEnd(25)} ${duration} (${agentStatus}${testStatus})`);
    }

    if (failures.length > 0) {
        console.log("\n🔍 Errors for failed problems:");
        for (const result of failures) {
            console.log(`  ❌ ${result.exercise}:`);
            if (result.agentError) {
                console.log(`     🤖 Agent: ${result.agentError.slice(0, 500)}${result.agentError.length > 500 ? '...' : ''}`);
            }
            if (result.testError) {
                console.log(`     🧪 Test: ${result.testError.slice(0, 500)}${result.testError.length > 500 ? '...' : ''}`);
            }
        }
    }
 }

 // Run the benchmark only when this file is executed directly.
 if (import.meta.main) {
    runBenchmark().catch((error: unknown) => {
        console.error(error);
        // Report an unexpected crash through the exit status as well as the log.
        process.exitCode = 1;
    });
 }
diff --git a/Dockerfile b/Dockerfile
 # Image containing the CLI coding agents (Claude Code, Codex, Goose, Aider) and yarn.
 FROM node:22

 ARG TZ
 ENV TZ="$TZ"

 # Install basic development tools and iptables/ipset.
 # apt-get is used because `apt` is meant for interactive use; the package lists are
 # removed afterwards to keep the layer small.
 RUN apt-get update && apt-get install -y less \
   git \
   procps \
   sudo \
   fzf \
   zsh \
   man-db \
   unzip \
   gnupg2 \
   gh \
   iptables \
   ipset \
   iproute2 \
   dnsutils \
   aggregate \
   ripgrep \
   jq \
   && rm -rf /var/lib/apt/lists/*

 # Ensure default node user has access to /usr/local/share
 RUN mkdir -p /usr/local/share/npm-global && \
   chown -R node:node /usr/local/share

 ARG USERNAME=node

 WORKDIR /workspace
 RUN mkdir -p /workspace && \
   chown -R node:node /workspace

 # Enable corepack for yarn version management (as root)
 ENV COREPACK_ENABLE_DOWNLOAD_PROMPT=0
 RUN corepack enable && corepack prepare yarn@stable --activate
 # Set up yarn and corepack environment
 ENV YARN_CACHE_FOLDER=/home/node/.yarn/cache
 ENV YARN_GLOBAL_FOLDER=/home/node/.yarn/global
 RUN mkdir -p /home/node/.yarn/cache /home/node/.yarn/global /home/node/.cache/node/corepack && \
     chown -R node:node /home/node/.yarn /home/node/.cache

 # Install global packages
 ENV NPM_CONFIG_PREFIX=/usr/local/share/npm-global
 ENV PATH=$PATH:/usr/local/share/npm-global/bin

 # Set up non-root user
 USER node

 # Install Claude Code
 RUN npm install -g @anthropic-ai/claude-code
 RUN mkdir -p /home/node/.claude

 # Install Codex CLI(Native)
 RUN npm install -g @openai/codex@native
 ENV CODEX_RUST=1
 # Write the instructions into the Codex config directory. A relative AGENTS.md would land in
 # WORKDIR (/workspace), which the benchmark script covers with a bind mount at run time.
 RUN mkdir -p /home/node/.codex && \
     echo "Solve this TypeScript exercise. Read the test file to understand requirements and implement the solution." > /home/node/.codex/AGENTS.md

 # Install Goose CLI
 RUN curl -fsSL https://github.com/block/goose/releases/download/stable/download_cli.sh | bash
 ENV HOME=/home/node
 ENV PATH=$HOME/.local/bin:$PATH

 # Install Aider
 RUN curl -LsSf https://aider.chat/install.sh | sh
 ENV AIDER_GIT=false
 ENV AIDER_AUTO_COMMITS=false
 ENV AIDER_SHOW_RELEASE_NOTES=false
 ENV AIDER_SKIP_SANITY_CHECK_REPO=true
 ENV AIDER_CHAT_HISTORY_FILE=""
 ENV AIDER_INPUT_HISTORY_FILE=""
diff --git a/RESULT-aider-gpt-4.1-mini.md b/RESULT-aider-gpt-4.1-mini.md
diff --git a/RESULT-google-gemini-2.5-flash.md b/RESULT-google-gemini-2.5-flash.md
	#!/usr/bin/env bun

	import { spawn } from "bun";
	import { join } from "path";
	import { readdir } from "fs/promises";

	// Docker image name used for every `docker run` issued by this script (agent runs and test runs).
	const CLAUDE_CODE_CONTAINER = "cli-agents-benchmark";

	// Directory, relative to the current working directory, that holds one sub-directory per exercise.
	const EXERCISM_PRACTICE_PATH = "exercism/typescript/exercises/practice";
	// Instruction text handed to the agents. The single quotes are part of the value; the command
	// builders interpolate it inside double quotes in the `sh -c` command strings.
	const SYSTEM_PROMPT = "'Solve this TypeScript exercise. Read the test file to understand requirements and implement the solution.'";

	/** Outcome of a single phase (agent run or test run) for one exercise. */
	interface AgentResult {
	/** Name of the exercise the phase was run for. */
	exercise: string;
	/** True when the spawned process exited with code 0. */
	success: boolean;
	/** Failure detail: the captured STDOUT/STDERR text, or the message of a thrown error. */
	error?: string;
	/** Wall-clock time of the phase in milliseconds. */
	duration: number;
	/** Captured standard output; absent when the phase failed with a thrown error. */
	output?: string;
	}

	/** Combined outcome of the agent phase and the test phase for one exercise. */
	interface TestResult {
	/** Name of the exercise. */
	exercise: string;
	/** Whether the agent process succeeded. */
	agentSuccess: boolean;
	/** Whether the test process succeeded. */
	testSuccess: boolean;
	/** True only when both the agent phase and the test phase succeeded. */
	overallSuccess: boolean;
	/** Error text from the agent phase, if it failed. */
	agentError?: string;
	/** Error text from the test phase, if it failed. */
	testError?: string;
	/** Duration of the agent phase in milliseconds. */
	agentDuration: number;
	/** Duration of the test phase in milliseconds. */
	testDuration: number;
	/** Duration of the whole exercise (reset + agent + tests) in milliseconds. */
	totalDuration: number;
	}

	/** Settings shared by every exercise in one benchmark run. */
	interface BenchmarkConfig {
	/** Shell command executed via `sh -c` inside the container to run the exercise's tests. */
	testCommand: string;
	/** Agent to run; the command builder recognises 'claude', 'goose', 'aider' and 'codex'. */
	agent: string;
	/** Model name passed to the agent. */
	model: string;
	/** Provider name; only the goose command uses it (as GOOSE_PROVIDER). */
	provider: string;
	/** When true, the spawned commands and truncated process output are logged. */
	verbose: boolean;
	}

	/**
	 * Lists the exercises available under EXERCISM_PRACTICE_PATH.
	 *
	 * @returns The names of all non-hidden sub-directories, sorted.
	 */
	async function getPracticeExercises(): Promise<string[]> {
	    const dirents = await readdir(join(process.cwd(), EXERCISM_PRACTICE_PATH), { withFileTypes: true });

	    const names: string[] = [];
	    for (const dirent of dirents) {
	        // Keep real directories only and skip dot-directories.
	        if (dirent.isDirectory() && !dirent.name.startsWith('.')) {
	            names.push(dirent.name);
	        }
	    }

	    names.sort();
	    return names;
	}


	/**
	 * Finds the test files of an exercise.
	 *
	 * @param exercisePath - Exercise directory, relative to the current working directory.
	 * @returns File names ending in `.test.ts`; an empty list (after a warning) when the
	 *          directory cannot be read.
	 */
	async function getTestFiles(exercisePath: string): Promise<string[]> {
	    let names: string[];
	    try {
	        names = await readdir(join(process.cwd(), exercisePath));
	    } catch {
	        console.warn(`Warning: Could not read test files from ${exercisePath}`);
	        return [];
	    }

	    const testFiles: string[] = [];
	    for (const name of names) {
	        if (name.endsWith('.test.ts')) {
	            testFiles.push(name);
	        }
	    }
	    return testFiles;
	}

	/**
	 * Returns the shell command used to run an exercise's tests.
	 *
	 * @param config - Benchmark settings; only `testCommand` is read.
	 */
	function buildTestCommand(config: BenchmarkConfig): string {
	    const { testCommand } = config;
	    return testCommand;
	}

	/**
	 * Builds the `docker run` argument vector that starts the selected agent on an exercise.
	 *
	 * @param config - Benchmark settings; `agent` selects the command, `model` and `provider`
	 *                 are passed through to the agent.
	 * @param exercisePath - Exercise directory, relative to the current working directory;
	 *                       it is bind-mounted at /workspace.
	 * @returns A fresh argument array (callers may mutate it).
	 * @throws Error when `config.agent` is not one of claude, goose, aider or codex.
	 */
	function buildAgentCommand(config: BenchmarkConfig, exercisePath: string): string[] {
	    const { agent, model, provider } = config;
	    const dockerRun = ["docker", "run", "--rm", "-i"];
	    // Tail shared by every agent: workspace mount, working directory, image, then `sh -c`.
	    const workspaceAndShell = [
	        "-v", `${join(process.cwd(), exercisePath)}:/workspace`,
	        "-w", "/workspace",
	        CLAUDE_CODE_CONTAINER,
	        "sh", "-c",
	    ];

	    switch (agent) {
	        case 'claude':
	            return [
	                ...dockerRun,
	                "-e", "ANTHROPIC_API_KEY",
	                ...workspaceAndShell,
	                `claude --dangerously-skip-permissions --model ${model} -p .docs/instructions.md --system-prompt "${SYSTEM_PROMPT}"`,
	            ];
	        case 'goose':
	            return [
	                ...dockerRun,
	                "-e", `GOOSE_MODEL=${model}`,
	                "-e", "OPENAI_API_KEY",
	                "-e", "ANTHROPIC_API_KEY",
	                "-e", "GOOGLE_API_KEY",
	                "-e", `GOOSE_PROVIDER=${provider}`,
	                "-e", "GOOSE_DISABLE_KEYRING=1",
	                ...workspaceAndShell,
	                `goose run --with-builtin "developer" -i .docs/instructions.md --system "${SYSTEM_PROMPT}"`,
	            ];
	        case 'aider':
	            return [
	                ...dockerRun,
	                "-e", "OPENAI_API_KEY",
	                "-e", "ANTHROPIC_API_KEY",
	                "-e", "GOOGLE_API_KEY",
	                ...workspaceAndShell,
	                // The globs must keep their `*`: bare `.ts` / `.test.ts` would name single
	                // files instead of matching the exercise's sources and tests.
	                `aider --yes-always --no-auto-commits --message "${SYSTEM_PROMPT} $(cat .docs/instructions.md)" --file *.ts --read *.test.ts`,
	            ];
	        case 'codex':
	            return [
	                ...dockerRun,
	                "-e", "OPENAI_API_KEY",
	                ...workspaceAndShell,
	                `codex exec --full-auto --skip-git-repo-check -m ${model} "$(cat .docs/instructions.md)"`,
	            ];
	        default:
	            throw new Error(`Unknown agent: ${agent}`);
	    }
	}

	/**
	 * Runs the configured agent against one exercise inside a Docker container.
	 *
	 * The exercise's `*.test.ts` files are additionally bind-mounted read-only on top of the
	 * workspace mount.
	 *
	 * @param config - Benchmark settings (agent, model, provider, verbosity).
	 * @param exercise - Exercise name, used for log output and in the result.
	 * @param exercisePath - Exercise directory, relative to the current working directory.
	 * @returns The phase outcome. A non-zero exit code or a thrown error is reported through
	 *          `success: false` and `error`; nothing is rethrown.
	 */
	async function runAgentPhase(config: BenchmarkConfig, exercise: string, exercisePath: string): Promise<AgentResult> {
	    const startTime = Date.now();

	    try {
	        const agentArgs = buildAgentCommand(config, exercisePath);

	        // Add test files as read-only mounts, inserted just before the image name.
	        const testFiles = await getTestFiles(exercisePath);
	        const imageIndex = agentArgs.indexOf(CLAUDE_CODE_CONTAINER);
	        for (const testFile of testFiles) {
	            agentArgs.splice(imageIndex, 0, "-v", `${join(process.cwd(), exercisePath, testFile)}:/workspace/${testFile}:ro`);
	        }

	        if (config.verbose) {
	            console.log(`🤖 Agent command: ${agentArgs.join(" ")}`);
	        }

	        // Bun.spawn inherits stderr unless told otherwise, which leaves `proc.stderr` without a
	        // stream and the captured STDERR empty. Request both pipes explicitly.
	        const proc = spawn(agentArgs, { stdout: "pipe", stderr: "pipe" });

	        // Drain both pipes while waiting for the exit so a chatty child cannot stall on a full pipe.
	        const [stdout, stderr, exitCode] = await Promise.all([
	            new Response(proc.stdout).text(),
	            new Response(proc.stderr).text(),
	            proc.exited,
	        ]);
	        const duration = Date.now() - startTime;

	        if (exitCode === 0) {
	            console.log(`🤖 ${exercise} - Agent Success (${duration}ms)`);
	            return { exercise, success: true, duration, output: stdout };
	        }

	        console.log(`🤖 ${exercise} - Agent Failed (${duration}ms)`);
	        if (config.verbose) {
	            console.log(` Agent STDOUT: ${stdout.slice(0, 500)}...`);
	            console.log(` Agent STDERR: ${stderr.slice(0, 500)}...`);
	        }
	        return { exercise, success: false, error: `STDOUT: ${stdout}\nSTDERR: ${stderr}`, duration, output: stdout };
	    } catch (error) {
	        const duration = Date.now() - startTime;
	        const errorMsg = error instanceof Error ? error.message : String(error);
	        console.log(`🤖 ${exercise} - Agent Error (${duration}ms): ${errorMsg}`);
	        return { exercise, success: false, error: errorMsg, duration };
	    }
	}

	/**
	 * Restores an exercise directory to its committed state with `git checkout HEAD -- .`.
	 *
	 * Best effort: a failure is logged as a warning and never thrown.
	 * NOTE(review): this checkout presumably leaves untracked files from an earlier run in
	 * place; confirm whether a `git clean` is also wanted.
	 *
	 * @param exercisePath - Exercise directory, relative to the current working directory.
	 * @param verbose - When true, progress messages are logged.
	 */
	async function resetExercise(exercisePath: string, verbose: boolean = false): Promise<void> {
	    try {
	        if (verbose) {
	            console.log(`🔄 Resetting exercise: ${exercisePath}`);
	        }

	        const fullExercisePath = join(process.cwd(), exercisePath);
	        const resetArgs = ["git", "-C", fullExercisePath, "checkout", "HEAD", "--", "."];

	        // stderr must be piped explicitly: Bun.spawn inherits it by default, which would leave
	        // the warning below without git's message. stdout is never read, so it is discarded.
	        const proc = spawn(resetArgs, { stdout: "ignore", stderr: "pipe" });
	        const [stderr, exitCode] = await Promise.all([
	            new Response(proc.stderr).text(),
	            proc.exited,
	        ]);

	        if (exitCode !== 0) {
	            console.warn(`Warning: Failed to reset ${exercisePath}: ${stderr}`);
	        } else if (verbose) {
	            console.log(`✅ Successfully reset ${exercisePath}`);
	        }
	    } catch (error) {
	        const errorMsg = error instanceof Error ? error.message : String(error);
	        console.warn(`Warning: Git reset failed for ${exercisePath}: ${errorMsg}`);
	    }
	}

	/**
	 * Runs the exercise's test command inside a Docker container.
	 *
	 * @param config - Benchmark settings; supplies the test command and verbosity.
	 * @param exercise - Exercise name, used for log output and in the result.
	 * @param exercisePath - Exercise directory, relative to the current working directory;
	 *                       it is bind-mounted at /workspace.
	 * @returns The phase outcome. A non-zero exit code or a thrown error is reported through
	 *          `success: false` and `error`; nothing is rethrown.
	 */
	async function runTestPhase(config: BenchmarkConfig, exercise: string, exercisePath: string): Promise<AgentResult> {
	    const startTime = Date.now();

	    try {
	        const testCommand = buildTestCommand(config);
	        const testArgs = [
	            "docker", "run", "--rm", "-i",
	            "-v", `${join(process.cwd(), exercisePath)}:/workspace`,
	            "-w", "/workspace",
	            CLAUDE_CODE_CONTAINER,
	            "sh", "-c", testCommand
	        ];

	        if (config.verbose) {
	            console.log(`🧪 Test command: ${testArgs.join(" ")}`);
	        }

	        // Bun.spawn inherits stderr unless told otherwise, which leaves `proc.stderr` without a
	        // stream and the captured STDERR empty. Request both pipes explicitly.
	        const proc = spawn(testArgs, { stdout: "pipe", stderr: "pipe" });

	        // Drain both pipes while waiting for the exit so a chatty child cannot stall on a full pipe.
	        const [stdout, stderr, exitCode] = await Promise.all([
	            new Response(proc.stdout).text(),
	            new Response(proc.stderr).text(),
	            proc.exited,
	        ]);
	        const duration = Date.now() - startTime;

	        if (exitCode === 0) {
	            console.log(`🧪 ${exercise} - Test Success (${duration}ms)`);
	            return { exercise, success: true, duration, output: stdout };
	        }

	        console.log(`🧪 ${exercise} - Test Failed (${duration}ms)`);
	        if (config.verbose) {
	            console.log(` Test STDOUT: ${stdout.slice(0, 500)}...`);
	            console.log(` Test STDERR: ${stderr.slice(0, 500)}...`);
	        }
	        return { exercise, success: false, error: `STDOUT: ${stdout}\nSTDERR: ${stderr}`, duration, output: stdout };
	    } catch (error) {
	        const duration = Date.now() - startTime;
	        const errorMsg = error instanceof Error ? error.message : String(error);
	        console.log(`🧪 ${exercise} - Test Error (${duration}ms): ${errorMsg}`);
	        return { exercise, success: false, error: errorMsg, duration };
	    }
	}

	/**
	 * Runs the full pipeline for one exercise: reset, agent phase, then test phase.
	 *
	 * The test phase always runs, even when the agent phase failed.
	 *
	 * @param config - Benchmark settings.
	 * @param exercise - Exercise directory name under EXERCISM_PRACTICE_PATH.
	 * @returns Combined success flags, errors and durations of both phases.
	 */
	async function runExercise(config: BenchmarkConfig, exercise: string): Promise<TestResult> {
	    const startedAt = Date.now();
	    const exercisePath = join(EXERCISM_PRACTICE_PATH, exercise);

	    console.log(`🧪 Starting ${exercise}... (Docker)`);

	    // Reset first, then let the agent work, then verify with the tests.
	    await resetExercise(exercisePath, config.verbose);
	    const agentOutcome = await runAgentPhase(config, exercise, exercisePath);
	    const testOutcome = await runTestPhase(config, exercise, exercisePath);

	    const totalDuration = Date.now() - startedAt;
	    const overallSuccess = agentOutcome.success && testOutcome.success;

	    if (!overallSuccess) {
	        console.log(`❌ ${exercise} - Overall Failed (${totalDuration}ms)`);
	        if (!agentOutcome.success) {
	            console.log(` 🤖 Agent failed: ${agentOutcome.error?.slice(0, 200)}...`);
	        }
	        if (!testOutcome.success) {
	            console.log(` 🧪 Test failed: ${testOutcome.error?.slice(0, 200)}...`);
	        }
	    } else {
	        console.log(`✅ ${exercise} - Overall Success (${totalDuration}ms)`);
	    }

	    const summary: TestResult = {
	        exercise,
	        agentSuccess: agentOutcome.success,
	        testSuccess: testOutcome.success,
	        overallSuccess,
	        agentError: agentOutcome.error,
	        testError: testOutcome.error,
	        agentDuration: agentOutcome.duration,
	        testDuration: testOutcome.duration,
	        totalDuration,
	    };
	    return summary;
	}

	/**
	 * Command-line entry point: parses the flags, selects the exercises, runs them one after
	 * another and prints a summary.
	 *
	 * Flags read from `process.argv`:
	 *   --model <name>      model passed to the agent (default 'sonnet')
	 *   --agent <name>      agent to run (default 'claude')
	 *   --provider <name>   provider, used by goose (default 'openai')
	 *   --exercise <value>  an exercise name, a path whose last segment is the name, or a
	 *                       number N meaning "the first N exercises"
	 *   --list              print the available exercises and exit
	 *   --verbose           log commands and truncated process output
	 *
	 * Without --exercise only the first exercise is run.
	 */
	async function runBenchmark(): Promise<void> {
	    const argv = process.argv;

	    // Returns the token following `flag`, or undefined when the flag is absent or last.
	    const optionValue = (flag: string): string | undefined => {
	        const index = argv.indexOf(flag);
	        return index !== -1 && index + 1 < argv.length ? argv[index + 1] : undefined;
	    };

	    const model = optionValue('--model') ?? 'sonnet';
	    const agent = optionValue('--agent') ?? 'claude';
	    const provider = optionValue('--provider') ?? 'openai';
	    const verbose = argv.includes('--verbose');
	    const listExercises = argv.includes('--list');

	    let specificExercise: string | null = optionValue('--exercise') ?? null;
	    let exerciseCount: number | null = null;

	    if (specificExercise && /^\d+$/.test(specificExercise)) {
	        // A purely numeric value is a count, not a name.
	        exerciseCount = parseInt(specificExercise, 10);
	        specificExercise = null;
	    } else if (specificExercise && specificExercise.includes('/')) {
	        // Accept a path and keep only its last segment; a trailing slash yields no name.
	        specificExercise = specificExercise.split('/').pop() || null;
	    }

	    const allExercises = await getPracticeExercises();

	    if (listExercises) {
	        console.log("📋 Available Exercism problems:");
	        allExercises.forEach((exercise, index) => {
	            console.log(` ${(index + 1).toString().padStart(3)}: ${exercise}`);
	        });
	        return;
	    }

	    console.log("🚀 Starting Exercism TypeScript benchmark");
	    console.log(`📋 Solving TypeScript problems with ${agent} agent (Docker mode, ${model} model)\n`);

	    let exercises: string[];
	    if (specificExercise) {
	        if (!allExercises.includes(specificExercise)) {
	            console.error(`❌ Specified problem '${specificExercise}' not found`);
	            console.log("Use --list option to see available problems");
	            return;
	        }
	        exercises = [specificExercise];
	        console.log(`🎯 Specified problem: ${specificExercise}\n`);
	    } else if (exerciseCount !== null) {
	        // Compared against null so that an explicit count of 0 is honoured rather than
	        // silently falling through to the "first exercise only" default.
	        const count = Math.min(exerciseCount, allExercises.length);
	        exercises = allExercises.slice(0, count);
	        console.log(`🔢 Number of problems: ${count} (out of ${allExercises.length})\n`);
	    } else {
	        exercises = allExercises.slice(0, 1);
	        console.log(`📊 Found problems: ${allExercises.length} (testing only the first one)\n`);
	    }

	    // Nothing selected (no exercises on disk, or a count of 0): stop before the summary,
	    // whose averages would otherwise be 0/0 and print as NaN.
	    if (exercises.length === 0) {
	        console.log("⚠️ No problems to run");
	        return;
	    }

	    const config: BenchmarkConfig = {
	        testCommand: 'yarn && yarn test',
	        agent,
	        model,
	        provider,
	        verbose
	    };

	    const results: TestResult[] = [];
	    for (const exercise of exercises) {
	        results.push(await runExercise(config, exercise));
	        // Short pause between exercises.
	        await new Promise(resolve => setTimeout(resolve, 1000));
	    }

	    const totalCount = results.length;
	    const failures = results.filter(r => !r.overallSuccess);
	    const successCount = totalCount - failures.length;
	    const successRate = (successCount / totalCount) * 100;
	    const avgDuration = results.reduce((sum, r) => sum + r.totalDuration, 0) / totalCount;
	    const agentSuccessCount = results.filter(r => r.agentSuccess).length;
	    const testSuccessCount = results.filter(r => r.testSuccess).length;

	    console.log("\n" + "=".repeat(50));
	    console.log("📈 Benchmark Results");
	    console.log("=".repeat(50));
	    console.log(`🎯 Success Rate: ${successRate.toFixed(1)}% (${successCount}/${totalCount})`);
	    console.log(`⏱️ Average Duration: ${avgDuration.toFixed(0)}ms`);
	    console.log(`✅ Overall Success: ${successCount}`);
	    console.log(`🤖 Agent Success: ${agentSuccessCount}`);
	    console.log(`🧪 Test Success: ${testSuccessCount}`);
	    console.log(`❌ Failed: ${totalCount - successCount}`);

	    console.log("\n📝 Detailed Results:");
	    for (const result of results) {
	        const overallStatus = result.overallSuccess ? "✅" : "❌";
	        const agentStatus = result.agentSuccess ? "🤖" : "❌";
	        const testStatus = result.testSuccess ? "🧪" : "❌";
	        const duration = `${result.totalDuration}ms`;
	        console.log(` ${overallStatus} ${result.exercise.padEnd(25)} ${duration} (${agentStatus}${testStatus})`);
	    }

	    if (failures.length > 0) {
	        console.log("\n🔍 Errors for failed problems:");
	        for (const result of failures) {
	            console.log(` ❌ ${result.exercise}:`);
	            if (result.agentError) {
	                console.log(` 🤖 Agent: ${result.agentError.slice(0, 500)}${result.agentError.length > 500 ? '...' : ''}`);
	            }
	            if (result.testError) {
	                console.log(` 🧪 Test: ${result.testError.slice(0, 500)}${result.testError.length > 500 ? '...' : ''}`);
	            }
	        }
	    }
	}

	// Run the benchmark only when this file is executed directly.
	if (import.meta.main) {
	    runBenchmark().catch((error: unknown) => {
	        console.error(error);
	        // Report an unexpected crash through the exit status as well as the log.
	        process.exitCode = 1;
	    });
	}
	# Image containing the CLI coding agents (Claude Code, Codex, Goose, Aider) and yarn.
	FROM node:22

	ARG TZ
	ENV TZ="$TZ"

	# Install basic development tools and iptables/ipset.
	# apt-get is used because `apt` is meant for interactive use; the package lists are
	# removed afterwards to keep the layer small.
	RUN apt-get update && apt-get install -y less \
	  git \
	  procps \
	  sudo \
	  fzf \
	  zsh \
	  man-db \
	  unzip \
	  gnupg2 \
	  gh \
	  iptables \
	  ipset \
	  iproute2 \
	  dnsutils \
	  aggregate \
	  ripgrep \
	  jq \
	  && rm -rf /var/lib/apt/lists/*

	# Ensure default node user has access to /usr/local/share
	RUN mkdir -p /usr/local/share/npm-global && \
	  chown -R node:node /usr/local/share

	ARG USERNAME=node

	WORKDIR /workspace
	RUN mkdir -p /workspace && \
	  chown -R node:node /workspace

	# Enable corepack for yarn version management (as root)
	ENV COREPACK_ENABLE_DOWNLOAD_PROMPT=0
	RUN corepack enable && corepack prepare yarn@stable --activate
	# Set up yarn and corepack environment
	ENV YARN_CACHE_FOLDER=/home/node/.yarn/cache
	ENV YARN_GLOBAL_FOLDER=/home/node/.yarn/global
	RUN mkdir -p /home/node/.yarn/cache /home/node/.yarn/global /home/node/.cache/node/corepack && \
	  chown -R node:node /home/node/.yarn /home/node/.cache

	# Install global packages
	ENV NPM_CONFIG_PREFIX=/usr/local/share/npm-global
	ENV PATH=$PATH:/usr/local/share/npm-global/bin

	# Set up non-root user
	USER node

	# Install Claude Code
	RUN npm install -g @anthropic-ai/claude-code
	RUN mkdir -p /home/node/.claude

	# Install Codex CLI(Native)
	RUN npm install -g @openai/codex@native
	ENV CODEX_RUST=1
	# Write the instructions into the Codex config directory. A relative AGENTS.md would land in
	# WORKDIR (/workspace), which the benchmark script covers with a bind mount at run time.
	RUN mkdir -p /home/node/.codex && \
	  echo "Solve this TypeScript exercise. Read the test file to understand requirements and implement the solution." > /home/node/.codex/AGENTS.md

	# Install Goose CLI
	RUN curl -fsSL https://github.com/block/goose/releases/download/stable/download_cli.sh | bash
	ENV HOME=/home/node
	ENV PATH=$HOME/.local/bin:$PATH

	# Install Aider
	RUN curl -LsSf https://aider.chat/install.sh | sh
	ENV AIDER_GIT=false
	ENV AIDER_AUTO_COMMITS=false
	ENV AIDER_SHOW_RELEASE_NOTES=false
	ENV AIDER_SKIP_SANITY_CHECK_REPO=true
	ENV AIDER_CHAT_HISTORY_FILE=""
	ENV AIDER_INPUT_HISTORY_FILE=""