"""
YiRage Kernel Submission Example — compatible with gpu-mode/popcorn-cli.

This file demonstrates a grayscale image conversion kernel that can be
submitted to the gpu-mode leaderboard (grayscale_v2).  It also shows the
YiRage workflow for quick optimization-effect validation before submission.

Usage
-----
1. Install YiRage:
       YIRAGE_BACKEND=cpu pip install -e . --no-build-isolation

2. Validate locally (no GPU required):
       python examples/submission.py --validate

3. Install popcorn-cli and submit:
       curl -fsSL https://raw.githubusercontent.com/gpu-mode/popcorn-cli/main/install.sh | bash
       popcorn-cli register discord
       popcorn-cli setup
       popcorn-cli submit --gpu A100 --leaderboard grayscale_v2 --mode leaderboard examples/submission.py
"""

from __future__ import annotations

import argparse
import time

import numpy as np

# ---------------------------------------------------------------------------
# Kernel implementation
# ---------------------------------------------------------------------------

try:
    import torch

    TORCH_AVAILABLE = True
except ImportError:  # pragma: no cover
    TORCH_AVAILABLE = False


def grayscale_numpy(image: np.ndarray) -> np.ndarray:
    """Convert an RGB array to grayscale using ITU-R BT.601 luma weights.

    Parameters
    ----------
    image:
        RGB image array whose last axis holds the three colour channels,
        typically shape (H, W, 3) with dtype uint8 or float32.

    Returns
    -------
    np.ndarray
        Luma image with the channel axis contracted away, e.g. shape (H, W),
        cast back to the dtype of *image*.  The weighted sum is evaluated in
        float32, so casting back to an integer dtype drops the fraction.
    """
    luma_weights = np.array([0.299, 0.587, 0.114], dtype=np.float32)
    pixels_f32 = image.astype(np.float32)
    # Contract the trailing channel axis against the weight vector.
    luma = np.matmul(pixels_f32, luma_weights)
    return luma.astype(image.dtype)


def grayscale_torch(image: "torch.Tensor") -> "torch.Tensor":
    """Torch-based grayscale conversion (ITU-R BT.601 luma coefficients).

    The computation runs on whatever device *image* already lives on; the
    coefficient tensor is created on ``image.device`` and no transfer of
    *image* is performed.

    Parameters
    ----------
    image:
        RGB tensor with shape (B, 3, H, W) or (3, H, W), dtype float32.

    Returns
    -------
    torch.Tensor
        Grayscale tensor with shape (B, 1, H, W) or (1, H, W).

    Raises
    ------
    RuntimeError
        If PyTorch is not installed.
    ValueError
        If *image* is not 3-D or 4-D, or if its channel axis does not hold
        exactly three entries.
    """
    if not TORCH_AVAILABLE:
        raise RuntimeError("PyTorch is not installed.")

    ndim = image.dim()
    if ndim not in (3, 4):
        raise ValueError(f"Expected 3-D or 4-D tensor, got {ndim}-D")

    # The channel axis sits right before (H, W): dim 0 for (3, H, W) and
    # dim 1 for (B, 3, H, W).
    channel_dim = ndim - 3
    channels = image.shape[channel_dim]
    if channels != 3:
        # Without this check a 1-channel input would silently broadcast
        # against the three coefficients and return a wrong result.
        raise ValueError(
            f"Expected 3 channels on dim {channel_dim}, got {channels}"
        )

    coeffs = torch.tensor([0.299, 0.587, 0.114], dtype=torch.float32, device=image.device)

    # A (3, 1, 1) weight view broadcasts over both layouts, aligning with the
    # channel axis of (3, H, W) as well as (B, 3, H, W).
    weighted = image * coeffs[:, None, None]
    return weighted.sum(dim=channel_dim, keepdim=True)


# ---------------------------------------------------------------------------
# popcorn-cli entry point — must be named `solution`
# ---------------------------------------------------------------------------


def solution(input_tensor: "torch.Tensor") -> "torch.Tensor":
    """Entry point for the leaderboard submission.

    Delegates directly to :func:`grayscale_torch`.

    Parameters
    ----------
    input_tensor:
        RGB float32 tensor with shape (B, 3, H, W) on CUDA.

    Returns
    -------
    torch.Tensor
        Grayscale float32 tensor with shape (B, 1, H, W).
    """
    grayscale = grayscale_torch(input_tensor)
    return grayscale


# ---------------------------------------------------------------------------
# Local validation helpers
# ---------------------------------------------------------------------------


def _benchmark(fn, *args, warmup: int = 10, iters: int = 100) -> float:
    """Return mean wall-clock time (ms) over *iters* calls after *warmup*."""
    import torch as _torch  # local import; safe — only called when torch is available

    def _sync() -> None:
        if TORCH_AVAILABLE and isinstance(args[0], _torch.Tensor) and args[0].is_cuda:
            _torch.cuda.synchronize()

    for _ in range(warmup):
        fn(*args)
    _sync()

    t0 = time.perf_counter()
    for _ in range(iters):
        fn(*args)
    _sync()

    return (time.perf_counter() - t0) / iters * 1000.0


def _validate_numpy(height: int = 256, width: int = 256) -> None:
    """Validate correctness and measure throughput with NumPy.

    A seeded random uint8 image of shape (height, width, 3) is converted with
    ``grayscale_numpy`` and compared against an elementwise reference; the
    float32 version of the same image is then benchmarked.

    Parameters
    ----------
    height, width:
        Size of the random test image.

    Raises
    ------
    AssertionError
        If the result has the wrong shape or differs from the reference by
        more than one grey level.
    """
    rng = np.random.default_rng(42)
    image = rng.integers(0, 256, (height, width, 3), dtype=np.uint8)

    result = grayscale_numpy(image)
    reference = (
        image[:, :, 0].astype(np.float32) * 0.299
        + image[:, :, 1].astype(np.float32) * 0.587
        + image[:, :, 2].astype(np.float32) * 0.114
    ).astype(np.uint8)

    # Explicit raise instead of `assert` so the check survives `python -O`.
    if result.shape != (height, width):
        raise AssertionError(f"Shape mismatch: {result.shape}")
    # The kernel and the reference accumulate the float32 products in
    # different orders, so a pixel whose luma lands right on an integer
    # boundary can truncate to adjacent uint8 values.  Compare in a signed
    # type (uint8 subtraction would wrap) and allow a one-level difference.
    np.testing.assert_allclose(
        result.astype(np.int16), reference.astype(np.int16), rtol=0, atol=1
    )
    print(f"  ✅  NumPy kernel: shape={result.shape}, dtype={result.dtype}")

    img_f32 = image.astype(np.float32)
    elapsed_ms = _benchmark(grayscale_numpy, img_f32, warmup=5, iters=50)
    # pixels / ms / 1e6 == pixels / s / 1e9, i.e. gigapixels per second.
    throughput_gpix_s = (height * width) / elapsed_ms / 1e6
    print(f"  ⏱   NumPy throughput: {elapsed_ms:.3f} ms/frame  ({throughput_gpix_s:.2f} Gpix/s)")


def _validate_torch(height: int = 256, width: int = 256) -> None:
    """Validate correctness and measure throughput with PyTorch.

    Picks CUDA, then MPS, then CPU, runs ``grayscale_torch`` on a seeded
    random batch of shape (4, 3, height, width), compares the result with a
    ``torch.einsum`` reference and benchmarks the kernel.  If PyTorch is not
    installed a notice is printed and the function returns.

    Parameters
    ----------
    height, width:
        Spatial size of the random test batch.

    Raises
    ------
    AssertionError
        If the result has the wrong shape or deviates from the reference.
    """
    if not TORCH_AVAILABLE:
        print("  ⚠️   PyTorch not available — skipping Torch validation")
        return

    import torch

    # torch.backends.mps does not exist in older PyTorch builds; probe for it
    # instead of assuming the attribute is there.
    mps_backend = getattr(torch.backends, "mps", None)
    if torch.cuda.is_available():
        device = "cuda"
    elif mps_backend is not None and mps_backend.is_available():
        device = "mps"
    else:
        device = "cpu"
    print(f"  🖥   Using device: {device}")

    rng = torch.Generator()
    rng.manual_seed(42)
    # A default torch.Generator is a CPU generator and torch.rand rejects it
    # when asked to sample on another device, so sample on the CPU and move
    # the batch afterwards.  This also keeps the data identical across devices.
    image = torch.rand(4, 3, height, width, generator=rng).to(device)

    result = grayscale_torch(image)
    # Explicit raise instead of `assert` so the check survives `python -O`.
    if result.shape != (4, 1, height, width):
        raise AssertionError(f"Shape mismatch: {result.shape}")

    # Reference using torch.einsum
    coeffs = torch.tensor([0.299, 0.587, 0.114], device=device)
    reference = torch.einsum("bchw,c->bhw", image, coeffs).unsqueeze(1)
    torch.testing.assert_close(result, reference, atol=1e-5, rtol=1e-5)
    print(f"  ✅  Torch kernel: shape={result.shape}, dtype={result.dtype}, device={device}")

    elapsed_ms = _benchmark(grayscale_torch, image, warmup=10, iters=100)
    # Four images per batch; pixels / ms / 1e6 is gigapixels per second.
    throughput_gpix_s = (4 * height * width) / elapsed_ms / 1e6
    print(f"  ⏱   Torch throughput: {elapsed_ms:.3f} ms/batch  ({throughput_gpix_s:.2f} Gpix/s)")


# ---------------------------------------------------------------------------
# YiRage optimization integration
# ---------------------------------------------------------------------------


def _try_yirage_optimize(height: int = 256, width: int = 256) -> None:
    """Attempt to further optimize the kernel using YiRage superoptimizer.

    This is optional — if yirage is not installed or the native runtime is
    unavailable, a clear message is printed and execution continues.
    """
    try:
        import yirage as yr  # noqa: F401

        print("\n  🚀  YiRage superoptimizer detected — running kernel search …")
        # Placeholder: real usage would call yr.superoptimize() on a kernel graph.
        # See docs/tutorial.md and examples/cluster/ for complete examples.
        print("  ℹ️   To superoptimize, build a kernel graph with yr.new_kernel_graph()")
        print("      and call yr.superoptimize(graph, backend='cuda').")
    except ImportError:
        print("\n  ℹ️   YiRage not installed — skipping superoptimizer pass.")
        print("      Install with: YIRAGE_BACKEND=cpu pip install -e . --no-build-isolation")


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------


def main() -> None:
    """Parse the command line and run local validation, or print the help.

    With ``--validate`` the NumPy, PyTorch and YiRage steps run in order,
    each receiving the ``--height`` / ``--width`` values, followed by a
    submission hint.  Without it only the argparse help text is printed.
    """
    parser = argparse.ArgumentParser(
        description="YiRage grayscale kernel — local validation & leaderboard submission helper"
    )
    parser.add_argument(
        "--validate",
        action="store_true",
        help="Run correctness checks and benchmark locally",
    )
    for flag, help_text in (
        ("--height", "Image height for benchmarks"),
        ("--width", "Image width for benchmarks"),
    ):
        parser.add_argument(flag, type=int, default=256, help=help_text)
    cli = parser.parse_args()

    # Nothing to do without --validate: show usage and stop.
    if not cli.validate:
        parser.print_help()
        return

    print("\n🔍  YiRage Kernel Validation")
    print("=" * 48)

    # Each stage prints its banner, then runs with the requested image size.
    stages = (
        ("\n[1/3] NumPy backend", _validate_numpy),
        ("\n[2/3] PyTorch backend", _validate_torch),
        ("\n[3/3] YiRage superoptimizer", _try_yirage_optimize),
    )
    for banner, run_stage in stages:
        print(banner)
        run_stage(cli.height, cli.width)

    print("\n✅  All validation steps completed.")
    print("\nNext step — submit to the leaderboard:")
    submit_hint = (
        "  popcorn-cli submit --gpu A100 --leaderboard grayscale_v2 "
        "--mode leaderboard examples/submission.py"
    )
    print(submit_hint)


# Run the CLI only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()