"""GPQA accuracy benchmark — scores the running server on multiple-choice Q&A.

Loads the GPQA dataset (https://github.com/idavidrein/gpqa — access-gated, you
must obtain the CSV yourself), turns each row into a shuffled A/B/C/D question,
sends it to the OpenAI-compatible server one at a time, parses the answer
letter, and reports accuracy.

    # 1. Start the server in another terminal
    python app.py

    # 2. Validate parsing on a few questions before a full run
    python tests/gpqa_bench.py path/to/gpqa_diamond.csv --limit 10

    # 3. Full run, writing a per-question log + final score to JSON
    python tests/gpqa_bench.py path/to/gpqa_diamond.csv --out results.json

Notes specific to this project:
  * The server serializes upstream Copilot calls behind a single lock
    (see server/api.py), so this runs strictly sequentially. A --delay between
    questions keeps things gentle on your account; please don't hammer it.
  * Consumer Copilot browses the web, so these scores measure
    "Copilot-with-search", not a closed-book model — not comparable to the
    GPQA paper's numbers. Label your results accordingly.
  * Answers are scored by extracting the letter from free text, so a refusal
    or off-format reply counts as wrong (logged as picked="?").
"""

import argparse
import csv
import json
import random
import re
import time
import urllib.error
import urllib.request

# Prompt template sent for every question. Placeholders: {q} is the question
# text and {a}..{d} are the four (already shuffled) answer options.
PROMPT = (
    "Answer the following multiple-choice question. "
    "Respond with ONLY the letter (A, B, C, or D) of the correct option "
    "on the first line.\n"
    "\n"
    "{q}\n"
    "\n"
    "A) {a}\n"
    "B) {b}\n"
    "C) {c}\n"
    "D) {d}"
)

# Option labels, in the order the options appear in PROMPT.
LETTERS = "ABCD"

# Column names in the GPQA CSVs. Edit these constants if your copy differs
# (some exports prefix or rename these).
COL_QUESTION = "Question"
COL_CORRECT = "Correct Answer"
COL_INCORRECT = [f"Incorrect Answer {n}" for n in (1, 2, 3)]


def build_question(row, rng):
    """Build the prompt for one CSV row with its options in random order.

    Args:
        row: Mapping for one CSV row; read through the COL_QUESTION,
            COL_CORRECT and COL_INCORRECT column names.
        rng: Random source whose ``shuffle`` method orders the options.

    Returns:
        ``(prompt, correct_letter)``: the formatted PROMPT text and the
        letter from LETTERS under which the correct answer was placed.
    """
    answer = row[COL_CORRECT].strip()

    # Start with the correct answer, then append the distractors.
    choices = [answer]
    for column in COL_INCORRECT:
        choices.append(row[column].strip())
    rng.shuffle(choices)

    # The letter is looked up by answer text after shuffling.
    gold_letter = LETTERS[choices.index(answer)]
    question_text = row[COL_QUESTION].strip()
    rendered = PROMPT.format(
        q=question_text,
        a=choices[0],
        b=choices[1],
        c=choices[2],
        d=choices[3],
    )
    return rendered, gold_letter


# Answer letter at the very start, optionally wrapped in "(", "[" or
# markdown "*" (e.g. "A", "A)", "(A)", "**A**").
_LEADING_LETTER = re.compile(r"[\s(\[*]*([ABCD])\b")
# A lowercase word right after a leading "A" suggests the English article
# ("A good question ..."). "A is ..." is excluded: there the A is a subject.
_ARTICLE_TAIL = re.compile(r"[ \t]+(?!is\b)[a-z]")
# Explicit statements such as "answer is C", "Answer: (B)", "answer = **D**".
# Only the keywords are case-insensitive; the letter must be a capital.
_STATED_ANSWER = re.compile(
    r"(?i:answer)(?i:\s+is)?\s*[:=]?\s*[(\[*]*([ABCD])\b"
)
# Any standalone capital A-D.
_ANY_LETTER = re.compile(r"\b([ABCD])\b")


def parse_letter(text):
    """Extract the chosen letter from the reply, or '?' if none is found.

    Candidates are tried in this order:

    1. A letter at the very start ("A", "A)", "(A)", "**A**"). A leading
       "A" followed by a lowercase word other than "is" is treated as a
       possible article and does not win at this step.
    2. The last explicit statement of the form "answer is X" / "Answer: X",
       so a passing mention of another option does not shadow the verdict.
    3. The first standalone capital A-D anywhere in the text.

    Args:
        text: Free-text reply from the model. ``None`` or an empty string
            is accepted and yields '?'.

    Returns:
        One of "A", "B", "C", "D", or "?" when no letter can be extracted.
    """
    if not text:
        return "?"
    head = text.strip()

    lead = _LEADING_LETTER.match(head)
    if lead:
        looks_like_article = (
            lead.group(1) == "A" and _ARTICLE_TAIL.match(head, lead.end())
        )
        if not looks_like_article:
            return lead.group(1)

    stated = _STATED_ANSWER.findall(head)
    if stated:
        # Take the last statement: a worked answer concludes at the end.
        return stated[-1]

    anywhere = _ANY_LETTER.search(head)
    return anywhere.group(1) if anywhere else "?"


def ask(endpoint, timeout, prompt):
    """Send one chat completion and return ``(reply_text, detail)``.

    Args:
        endpoint: Full URL of the chat-completions endpoint.
        timeout: Per-request timeout in seconds, passed to ``urlopen``.
        prompt: Text sent as the single user message.

    Returns:
        ``(reply, None)`` on success, where ``reply`` is always a ``str``.
        ``(None, detail)`` on any failure, where ``detail`` is a non-empty
        description: HTTP error (with a snippet of the response body when
        one can be read), transport/JSON error, an ``error`` payload, or a
        response that does not contain a text reply.
    """
    body = json.dumps({
        "model": "copilot",
        "messages": [{"role": "user", "content": prompt}],
    }).encode("utf-8")
    req = urllib.request.Request(
        endpoint,
        data=body,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            payload = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        detail = f"HTTP {exc.code}: {exc.reason}"
        # The body of an error response usually carries the server's actual
        # message; reading it is best-effort and must not mask the error.
        try:
            snippet = exc.read(500).decode("utf-8", "replace")
        except Exception:
            snippet = ""
        snippet = " ".join(snippet.split())[:200]
        if snippet:
            detail += f" | {snippet}"
        return None, detail
    except Exception as exc:  # timeout, connection reset, malformed body, ...
        return None, f"{type(exc).__name__}: {exc}"

    if isinstance(payload, dict) and "error" in payload:
        return None, f"error payload: {payload['error']}"
    try:
        content = payload["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as exc:
        return None, f"{type(exc).__name__}: {exc}"
    # A null or structured content would otherwise reach the caller with no
    # detail, or crash the letter parser downstream.
    if not isinstance(content, str):
        return None, f"reply has no text content (got {type(content).__name__})"
    return content, None


def main(argv=None):
    """Run the benchmark from the command line.

    Parses the arguments, loads the CSV, asks each question in order with a
    pause between requests, prints a running and a final accuracy, and
    optionally writes a JSON log. Ctrl-C stops the loop and the partial
    results are still reported.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour.

    Raises:
        SystemExit: With status 2 (via ``parser.error``) for invalid
            arguments, an unreadable or empty CSV, or missing columns.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("csv", help="Path to the GPQA CSV file")
    parser.add_argument(
        "--url", default="http://localhost:8000",
        help="Server base URL (default: http://localhost:8000)",
    )
    parser.add_argument(
        "--limit", type=int, default=0,
        help="Only run the first N questions (default: 0 = all)",
    )
    parser.add_argument(
        "--delay", type=float, default=2.0,
        help="Seconds to wait between questions (default: 2.0)",
    )
    parser.add_argument(
        "--timeout", type=float, default=180,
        help="Per-request timeout in seconds (default: 180)",
    )
    parser.add_argument(
        "--seed", type=int, default=0,
        help="RNG seed for option shuffling, for reproducibility (default: 0)",
    )
    parser.add_argument(
        "--out", default=None,
        help="Write per-question log + final score to this JSON file",
    )
    args = parser.parse_args(argv)

    # Reject values that would otherwise misbehave silently: a negative
    # limit slices rows from the end, a negative delay makes time.sleep
    # raise in the middle of a run.
    if args.limit < 0:
        parser.error("--limit must be >= 0")
    if args.delay < 0:
        parser.error("--delay must be >= 0")
    if args.timeout <= 0:
        parser.error("--timeout must be > 0")

    endpoint = args.url.rstrip("/") + "/v1/chat/completions"
    rng = random.Random(args.seed)

    # newline="" lets the csv module handle newlines embedded in quoted
    # fields; utf-8-sig drops a leading BOM so the first header name stays
    # intact (plain UTF-8 files decode identically).
    try:
        with open(args.csv, encoding="utf-8-sig", newline="") as f:
            rows = list(csv.DictReader(f))
    except (OSError, UnicodeDecodeError, csv.Error) as exc:
        parser.error(f"Cannot read {args.csv}: {exc}")
    if not rows:
        parser.error(f"No rows found in {args.csv}")
    missing = [c for c in [COL_QUESTION, COL_CORRECT, *COL_INCORRECT]
               if c not in rows[0]]
    if missing:
        parser.error(
            f"CSV is missing expected column(s): {missing}. "
            f"Found columns: {list(rows[0])}"
        )
    if args.limit:
        rows = rows[:args.limit]

    print(f"GPQA benchmark against {endpoint}")
    print(f"{len(rows)} question(s), seed={args.seed}, delay={args.delay}s\n")

    log = []
    correct = 0
    start = time.perf_counter()
    try:
        for i, row in enumerate(rows, 1):
            prompt, gold = build_question(row, rng)
            reply, detail = ask(endpoint, args.timeout, prompt)
            # A failed request is scored as wrong and logged as picked="?".
            picked = parse_letter(reply) if reply is not None else "?"
            ok = picked == gold
            correct += ok

            log.append({
                "index": i,
                "picked": picked,
                "gold": gold,
                "correct": ok,
                "error": detail,
                "reply": (reply or "").strip()[:500],
            })
            flag = "✓" if ok else "✗"
            note = f"  ({detail})" if detail else ""
            print(f"[{i}/{len(rows)}] {flag} picked={picked} gold={gold}  "
                  f"acc={correct / i:.1%}{note}")

            # No pause after the last question.
            if i < len(rows):
                time.sleep(args.delay)
    except KeyboardInterrupt:
        print("\nInterrupted — reporting partial results.")

    # Only questions that completed are counted, so an interrupted run
    # reports accuracy over what was actually asked.
    answered = len(log)
    wall = time.perf_counter() - start
    accuracy = correct / answered if answered else 0.0
    print(f"\nFinal: {correct}/{answered} = {accuracy:.1%}  "
          f"(wall {wall:.0f}s)")

    if args.out:
        summary = {
            "csv": args.csv,
            "endpoint": endpoint,
            "seed": args.seed,
            "answered": answered,
            "correct": correct,
            "accuracy": accuracy,
            "wall_seconds": round(wall, 1),
            "results": log,
        }
        with open(args.out, "w", encoding="utf-8") as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)
        print(f"Wrote {args.out}")


if __name__ == "__main__":
    main()