{ "date": "2026-05-18", "tools": [ "caveman", "codex" ], "tasks": 25, "config": { "caveman": { "provider": "openai-codex", "model": "gpt-5.5", "thinking": "xhigh" } }, "aggregate": { "caveman": { "tool": "caveman", "tasks": 25, "resolved": 14, "rate": 0.56, "tokens_total": 524703, "cost_total": 1.7815035000000001 }, "codex": { "tool": "codex", "tasks": 25, "resolved": 15, "rate": 0.6, "tokens_total": 1010185, "cost_total": 0 } }, "results": [ { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "easy-py-01-add-docstrings", "difficulty": "easy", "language": "python", "resolved": true, "duration_ms": 19683, "tokens_fresh": 26983, "tokens_input": 26550, "tokens_output": 433, "tokens_cache_read": 12288, "tokens_cache_write": 0, "cost_usd": 0.07594200000000001, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/easy-py-01-add-docstrings.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "easy-py-01-add-docstrings", "difficulty": "easy", "language": "python", "resolved": true, "duration_ms": 49016, "tokens_fresh": 22768, "tokens_input": 22768, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/easy-py-01-add-docstrings.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "easy-py-02-fix-assertion", "difficulty": "easy", "language": "python", "resolved": false, "duration_ms": 17376, "tokens_fresh": 17023, "tokens_input": 16523, "tokens_output": 500, "tokens_cache_read": 41472, "tokens_cache_write": 0, "cost_usd": 0.0591755, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/easy-py-02-fix-assertion.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "easy-py-02-fix-assertion", "difficulty": "easy", "language": "python", "resolved": true, "duration_ms": 30656, "tokens_fresh": 40422, "tokens_input": 40422, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/easy-py-02-fix-assertion.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "easy-py-03-fix-import", "difficulty": "easy", "language": "python", "resolved": true, "duration_ms": 17516, "tokens_fresh": 14709, "tokens_input": 14502, "tokens_output": 207, "tokens_cache_read": 50176, "tokens_cache_write": 0, "cost_usd": 0.051904, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/easy-py-03-fix-import.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "easy-py-03-fix-import", "difficulty": "easy", "language": "python", "resolved": true, "duration_ms": 34442, "tokens_fresh": 41924, "tokens_input": 41924, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/easy-py-03-fix-import.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "easy-py-04-type-hints", "difficulty": "easy", "language": "python", "resolved": true, "duration_ms": 13469, "tokens_fresh": 25002, "tokens_input": 24719, "tokens_output": 283, "tokens_cache_read": 13824, "tokens_cache_write": 0, "cost_usd": 0.0694985, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/easy-py-04-type-hints.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "easy-py-04-type-hints", "difficulty": "easy", "language": "python", "resolved": true, "duration_ms": 31360, "tokens_fresh": 39901, "tokens_input": 39901, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/easy-py-04-type-hints.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "easy-ts-01-add-jsdoc", "difficulty": "easy", "language": "typescript", "resolved": true, "duration_ms": 14306, "tokens_fresh": 14530, "tokens_input": 14104, "tokens_output": 426, "tokens_cache_read": 24576, "tokens_cache_write": 0, "cost_usd": 0.047794, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/easy-ts-01-add-jsdoc.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "easy-ts-01-add-jsdoc", "difficulty": "easy", "language": "typescript", "resolved": true, "duration_ms": 29026, "tokens_fresh": 39409, "tokens_input": 39409, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/easy-ts-01-add-jsdoc.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "easy-ts-02-fix-type-error", "difficulty": "easy", "language": "typescript", "resolved": true, "duration_ms": 18653, "tokens_fresh": 14003, "tokens_input": 13871, "tokens_output": 132, "tokens_cache_read": 24576, "tokens_cache_write": 0, "cost_usd": 0.042801500000000006, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/easy-ts-02-fix-type-error.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "easy-ts-02-fix-type-error", "difficulty": "easy", "language": "typescript", "resolved": true, "duration_ms": 114598, "tokens_fresh": 82732, "tokens_input": 82732, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/easy-ts-02-fix-type-error.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "easy-ts-03-add-error-handling", "difficulty": "easy", "language": "typescript", "resolved": true, "duration_ms": 22065, "tokens_fresh": 15200, "tokens_input": 14455, "tokens_output": 745, "tokens_cache_read": 24576, "tokens_cache_write": 0, "cost_usd": 0.053456500000000004, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/easy-ts-03-add-error-handling.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "easy-ts-03-add-error-handling", "difficulty": "easy", "language": "typescript", "resolved": true, "duration_ms": 96710, "tokens_fresh": 24674, "tokens_input": 24674, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/easy-ts-03-add-error-handling.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "easy-ts-04-rename-variable", "difficulty": "easy", "language": "typescript", "resolved": true, "duration_ms": 9635, "tokens_fresh": 14281, "tokens_input": 13998, "tokens_output": 283, "tokens_cache_read": 24576, "tokens_cache_write": 0, "cost_usd": 0.045384, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/easy-ts-04-rename-variable.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "easy-ts-04-rename-variable", "difficulty": "easy", "language": "typescript", "resolved": true, "duration_ms": 38769, "tokens_fresh": 41425, "tokens_input": 41425, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/easy-ts-04-rename-variable.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "hard-py-01-async-refactor", "difficulty": "hard", "language": "python", "resolved": false, "duration_ms": 60672, "tokens_fresh": 21206, "tokens_input": 20186, "tokens_output": 1020, "tokens_cache_read": 76800, "tokens_cache_write": 0, "cost_usd": 0.08496500000000001, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/hard-py-01-async-refactor.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "hard-py-01-async-refactor", "difficulty": "hard", "language": "python", "resolved": false, "duration_ms": 66447, "tokens_fresh": 35336, "tokens_input": 35336, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/hard-py-01-async-refactor.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "hard-py-02-optimize-algorithm", "difficulty": "hard", "language": "python", "resolved": false, "duration_ms": 29290, "tokens_fresh": 22046, "tokens_input": 21110, "tokens_output": 936, "tokens_cache_read": 40448, "tokens_cache_write": 0, "cost_usd": 0.07692700000000001, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/hard-py-02-optimize-algorithm.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "hard-py-02-optimize-algorithm", "difficulty": "hard", "language": "python", "resolved": false, "duration_ms": 179073, "tokens_fresh": 88563, "tokens_input": 88563, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/hard-py-02-optimize-algorithm.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "hard-py-03-decorator-pattern", "difficulty": "hard", "language": "python", "resolved": false, "duration_ms": 19708, "tokens_fresh": 17198, "tokens_input": 16622, "tokens_output": 576, "tokens_cache_read": 38912, "tokens_cache_write": 0, "cost_usd": 0.059923000000000004, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/hard-py-03-decorator-pattern.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "hard-py-03-decorator-pattern", "difficulty": "hard", "language": "python", "resolved": false, "duration_ms": 48019, "tokens_fresh": 32883, "tokens_input": 32883, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/hard-py-03-decorator-pattern.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "hard-py-04-concurrent-fix", "difficulty": "hard", "language": "python", "resolved": false, "duration_ms": 39783, "tokens_fresh": 37568, "tokens_input": 36248, "tokens_output": 1320, "tokens_cache_read": 78336, "tokens_cache_write": 0, "cost_usd": 0.130004, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/hard-py-04-concurrent-fix.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "hard-py-04-concurrent-fix", "difficulty": "hard", "language": "python", "resolved": false, "duration_ms": 60622, "tokens_fresh": 44832, "tokens_input": 44832, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/hard-py-04-concurrent-fix.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "hard-ts-01-generic-refactor", "difficulty": "hard", "language": "typescript", "resolved": false, "duration_ms": 29436, "tokens_fresh": 14570, "tokens_input": 13621, "tokens_output": 949, "tokens_cache_read": 39936, "tokens_cache_write": 0, "cost_usd": 0.058271500000000004, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/hard-ts-01-generic-refactor.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "hard-ts-01-generic-refactor", "difficulty": "hard", "language": "typescript", "resolved": false, "duration_ms": 59409, "tokens_fresh": 46576, "tokens_input": 46576, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/hard-ts-01-generic-refactor.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "hard-ts-02-state-machine", "difficulty": "hard", "language": "typescript", "resolved": false, "duration_ms": 36098, "tokens_fresh": 16882, "tokens_input": 16204, "tokens_output": 678, "tokens_cache_read": 64000, "tokens_cache_write": 0, "cost_usd": 0.06668, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/hard-ts-02-state-machine.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "hard-ts-02-state-machine", "difficulty": "hard", "language": "typescript", "resolved": false, "duration_ms": 59855, "tokens_fresh": 25869, "tokens_input": 25869, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/hard-ts-02-state-machine.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "hard-ts-03-multi-file-refactor", "difficulty": "hard", "language": "typescript", "resolved": true, "duration_ms": 29262, "tokens_fresh": 33583, "tokens_input": 32757, "tokens_output": 826, "tokens_cache_read": 40448, "tokens_cache_write": 0, "cost_usd": 0.1043945, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/hard-ts-03-multi-file-refactor.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "hard-ts-03-multi-file-refactor", "difficulty": "hard", "language": "typescript", "resolved": true, "duration_ms": 98176, "tokens_fresh": 57220, "tokens_input": 57220, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/hard-ts-03-multi-file-refactor.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "medium-py-01-refactor-class", "difficulty": "medium", "language": "python", "resolved": true, "duration_ms": 24321, "tokens_fresh": 27999, "tokens_input": 27285, "tokens_output": 714, "tokens_cache_read": 26112, "tokens_cache_write": 0, "cost_usd": 0.0854505, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/medium-py-01-refactor-class.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "medium-py-01-refactor-class", "difficulty": "medium", "language": "python", "resolved": true, "duration_ms": 55336, "tokens_fresh": 34283, "tokens_input": 34283, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/medium-py-01-refactor-class.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "medium-py-02-fix-bug-with-test", "difficulty": "medium", "language": "python", "resolved": false, "duration_ms": 32655, "tokens_fresh": 32834, "tokens_input": 32385, "tokens_output": 449, "tokens_cache_read": 62464, "tokens_cache_write": 0, "cost_usd": 0.10331350000000002, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/medium-py-02-fix-bug-with-test.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "medium-py-02-fix-bug-with-test", "difficulty": "medium", "language": "python", "resolved": false, "duration_ms": 51976, "tokens_fresh": 25740, "tokens_input": 25740, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/medium-py-02-fix-bug-with-test.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "medium-py-03-add-cli-arg", "difficulty": "medium", "language": "python", "resolved": false, "duration_ms": 11723, "tokens_fresh": 14465, "tokens_input": 14222, "tokens_output": 243, "tokens_cache_read": 24576, "tokens_cache_write": 0, "cost_usd": 0.04534400000000001, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/medium-py-03-add-cli-arg.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "medium-py-03-add-cli-arg", "difficulty": "medium", "language": "python", "resolved": false, "duration_ms": 61158, "tokens_fresh": 42617, "tokens_input": 42617, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/medium-py-03-add-cli-arg.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "medium-py-04-fix-json-parser", "difficulty": "medium", "language": "python", "resolved": false, "duration_ms": 15255, "tokens_fresh": 16218, "tokens_input": 15760, "tokens_output": 458, "tokens_cache_read": 37888, "tokens_cache_write": 0, "cost_usd": 0.05574200000000001, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/medium-py-04-fix-json-parser.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "medium-py-04-fix-json-parser", "difficulty": "medium", "language": "python", "resolved": false, "duration_ms": 45857, "tokens_fresh": 26265, "tokens_input": 26265, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/medium-py-04-fix-json-parser.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "medium-ts-01-extract-function", "difficulty": "medium", "language": "typescript", "resolved": false, "duration_ms": 29663, "tokens_fresh": 16188, "tokens_input": 15260, "tokens_output": 928, "tokens_cache_read": 38400, "tokens_cache_write": 0, "cost_usd": 0.06167000000000001, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/medium-ts-01-extract-function.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "medium-ts-01-extract-function", "difficulty": "medium", "language": "typescript", "resolved": false, "duration_ms": 54324, "tokens_fresh": 15207, "tokens_input": 15207, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/medium-ts-01-extract-function.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "medium-ts-02-replace-api", "difficulty": "medium", "language": "typescript", "resolved": true, "duration_ms": 62031, "tokens_fresh": 38137, "tokens_input": 36239, "tokens_output": 1898, "tokens_cache_read": 92160, "tokens_cache_write": 0, "cost_usd": 0.14210750000000003, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/medium-ts-02-replace-api.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "medium-ts-02-replace-api", "difficulty": "medium", "language": "typescript", "resolved": true, "duration_ms": 92378, "tokens_fresh": 54571, "tokens_input": 54571, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/medium-ts-02-replace-api.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "medium-ts-03-fix-failing-test", "difficulty": "medium", "language": "typescript", "resolved": true, "duration_ms": 21543, "tokens_fresh": 15893, "tokens_input": 15522, "tokens_output": 371, "tokens_cache_read": 73216, "tokens_cache_write": 0, "cost_usd": 0.06267400000000001, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/medium-ts-03-fix-failing-test.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "medium-ts-03-fix-failing-test", "difficulty": "medium", "language": "typescript", "resolved": true, "duration_ms": 147714, "tokens_fresh": 56068, "tokens_input": 56068, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/medium-ts-03-fix-failing-test.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "medium-ts-04-create-module", "difficulty": "medium", "language": "typescript", "resolved": true, "duration_ms": 18403, "tokens_fresh": 13000, "tokens_input": 12634, "tokens_output": 366, "tokens_cache_read": 26112, "tokens_cache_write": 0, "cost_usd": 0.043603, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/medium-ts-04-create-module.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "medium-ts-04-create-module", "difficulty": "medium", "language": "typescript", "resolved": true, "duration_ms": 65255, "tokens_fresh": 26164, "tokens_input": 26164, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/medium-ts-04-create-module.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "medium-ts-05-implement-interface", "difficulty": "medium", "language": "typescript", "resolved": true, "duration_ms": 29226, "tokens_fresh": 17349, "tokens_input": 16518, "tokens_output": 831, "tokens_cache_read": 50176, "tokens_cache_write": 0, "cost_usd": 0.066304, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/medium-ts-05-implement-interface.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "medium-ts-05-implement-interface", "difficulty": "medium", "language": "typescript", "resolved": true, "duration_ms": 35063, "tokens_fresh": 40523, "tokens_input": 40523, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/medium-ts-05-implement-interface.log" }, { "tool": "caveman", "provider": "openai-codex", "model": "gpt-5.5", "task_id": "medium-ts-06-wire-event-handler", "difficulty": "medium", "language": "typescript", "resolved": true, "duration_ms": 24976, "tokens_fresh": 27836, "tokens_input": 27148, "tokens_output": 688, "tokens_cache_read": 39936, "tokens_cache_write": 0, "cost_usd": 0.08817400000000002, "config_fingerprint": "rtk=on,cave-mode=ultra,tool-compression=on,ml=on,model=gpt-5.5,thinking=xhigh", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/caveman/medium-ts-06-wire-event-handler.log" }, { "tool": "codex", "provider": null, "model": "gpt-5.5", "task_id": "medium-ts-06-wire-event-handler", "difficulty": "medium", "language": "typescript", "resolved": true, "duration_ms": 52248, "tokens_fresh": 24213, "tokens_input": 24213, "tokens_output": 0, "tokens_cache_read": null, "tokens_cache_write": null, "cost_usd": null, "config_fingerprint": "sandbox=workspace-write,model=gpt-5.5", "error": null, "raw_log_path": "/Users/julb/Desktop/GitHub/caveman-cli/research/results/honest-bench-2026-05-18/codex/medium-ts-06-wire-event-handler.log" } ] }