"""Tests for execution/providers/gemini_vision.py (CP-9 Phase 2).

All transport is mocked via the `transport=` kwarg. NO live API calls.

Coverage maps to BUILD_SPEC § Test surface and audit § 12a corrections:
    success path (image + video + audio); 401 / 403 / 404 / 400 / 402;
    429 (with retries + exhaustion); 500 (with retries + exhaustion);
    503 (longer-backoff retries); 504 (2x retries); URLError /
    socket.timeout (with retries); response not JSON (top-level);
    verdict text not JSON; verdict missing score / reasoning; score
    clipping (high + low); MAX_TOKENS clean parse (warning); MAX_TOKENS
    unparseable (raises with truncated_unparseable); missing
    candidatesTokenCount (totalTokenCount fallback); missing both
    (warning + cost=0); 20 MB ceiling (raises EvalPayloadError mentioning
    Files API); Retry-After header honored; exception chaining
    (`raise X from original_exc`); auth header precedence (GEMINI vs
    GOOGLE); judge_id thread-through; camelCase request body keys;
    base64 standard (NOT urlsafe).
"""

from __future__ import annotations

import base64
import json
import socket
import urllib.error
from typing import Optional

import pytest

from recoil.execution.providers import gemini_vision as gv
from recoil.execution.providers.gemini_vision import (
    DEFAULT_MODEL_ID,
    EvalAuthError,
    EvalNetworkError,
    EvalPayloadError,
    EvalProviderError,
    EvalProviderResult,
    EvalQuotaError,
    EvalRateLimitError,
    EvalServerError,
    PRIMARY_AUTH_ENV_VAR,
    score_artifact,
)


# ---------------------------------------------------------------------------
# Mock transport plumbing (mirrors test_audio_provider_elevenlabs.py)
# ---------------------------------------------------------------------------

class _MockResponse:
    """Mimics the urlopen-returned context-managed response object."""

    def __init__(
        self,
        *,
        status: int = 200,
        body: bytes = b"",
        headers: Optional[dict] = None,
    ):
        self.status = status
        self._body = body
        self.headers = headers or {}

    def read(self) -> bytes:
        return self._body

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        return False


class _MockTransport:
    """Records calls; returns queued responses or raises queued exceptions."""

    def __init__(self, *responses):
        self._responses = list(responses)
        self.calls: list[dict] = []
        self.call_count = 0

    def __call__(self, url, *, headers, body, timeout):
        self.call_count += 1
        self.calls.append({
            "url": url,
            "headers": dict(headers),
            "body": body,
            "timeout": timeout,
        })
        if not self._responses:
            raise AssertionError("MockTransport ran out of responses")
        nxt = self._responses.pop(0)
        if isinstance(nxt, BaseException):
            raise nxt
        return nxt


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------

@pytest.fixture(autouse=True)
def _set_api_key(monkeypatch):
    """Run every test with GEMINI_API_KEY set and GOOGLE_API_KEY removed."""
    monkeypatch.delenv("GOOGLE_API_KEY", raising=False)
    monkeypatch.setenv("GEMINI_API_KEY", "test-key-abc")


@pytest.fixture(autouse=True)
def _no_sleep(monkeypatch):
    """Turn backoff sleeps into no-ops so retry tests finish instantly."""

    def _skip_sleep(*_ignored):
        return None

    monkeypatch.setattr(gv.time, "sleep", _skip_sleep)


@pytest.fixture(autouse=True)
def _deterministic_jitter(monkeypatch):
    """Make ``random.uniform`` return the midpoint of its range.

    With jitter pinned this way, tests can assert exact sleep durations.
    """

    def _midpoint(lo, hi):
        return (lo + hi) / 2.0

    monkeypatch.setattr(gv.random, "uniform", _midpoint)


@pytest.fixture(autouse=True)
def _stub_model_profiles(monkeypatch):
    """Install a fake ``recoil.core.model_profiles`` module for every test.

    Phase 2 must not depend on Phase 3's on-disk profile entry, so the stub
    serves fixed rates for DEFAULT_MODEL_ID and raises ``KeyError`` for any
    other model id. Tests needing different rates install their own module.
    """
    import sys
    import types

    default_rates = {
        "cost_per_1k_input_tokens": 0.002,
        "cost_per_1k_output_tokens": 0.012,
        "cost_per_1k_input_tokens_long_context": 0.004,
        "cost_per_1k_output_tokens_long_context": 0.018,
        "long_context_threshold_tokens": 200_000,
    }

    def fake_get_profile(model_id):
        if model_id != DEFAULT_MODEL_ID:
            raise KeyError(model_id)
        # Fresh copy per call so a caller mutating it cannot leak state.
        return dict(default_rates)

    stub = types.ModuleType("recoil.core.model_profiles")
    stub.get_profile = fake_get_profile
    monkeypatch.setitem(sys.modules, "recoil.core.model_profiles", stub)


@pytest.fixture
def small_png(tmp_path):
    """Write a tiny ``frame.png`` into tmp_path and return its path."""
    # Bytes start with the PNG signature followed by an IHDR chunk that
    # declares a 1x1 image; the file also carries a .png suffix.
    png_hex = "".join((
        "89504e470d0a1a0a0000000d49484452",
        "00000001000000010806000000",
        "1f15c4890000000a49444154789c63",
        "00010000000500010d0a2db40000000049454e44ae426082",
    ))
    target = tmp_path / "frame.png"
    target.write_bytes(bytes.fromhex(png_hex))
    return target


@pytest.fixture
def small_mp4(tmp_path):
    """Write a tiny ``clip.mp4`` (an ``ftypmp42`` header plus zero padding)."""
    header = b"\x00\x00\x00\x18ftypmp42"
    padding = bytes(64)
    clip = tmp_path / "clip.mp4"
    clip.write_bytes(header + padding)
    return clip


@pytest.fixture
def small_mp3(tmp_path):
    """Write a tiny ``clip.mp3`` (ID3 tag bytes, a frame-sync marker, padding)."""
    payload = b"".join((
        b"ID3\x03\x00\x00\x00\x00\x00\x00",
        b"\xFF\xFB",
        bytes(64),
    ))
    clip = tmp_path / "clip.mp3"
    clip.write_bytes(payload)
    return clip


# ---------------------------------------------------------------------------
# Helpers — synthesize Gemini-shaped response bodies / errors
# ---------------------------------------------------------------------------

def _ok_body(
    *,
    score: float = 0.85,
    reasoning: str = "Good composition. Lighting motivated. Production-grade.",
    finish: str = "STOP",
    prompt_tokens: int = 250,
    cand_tokens: Optional[int] = 80,
    total_tokens: Optional[int] = None,
    raw_text_override: Optional[str] = None,
    fence_wrapped: bool = False,
) -> bytes:
    if raw_text_override is not None:
        text = raw_text_override
    else:
        verdict = {"score": score, "reasoning": reasoning}
        text = json.dumps(verdict)
        if fence_wrapped:
            text = "```json\n" + text + "\n```"
    usage: dict = {"promptTokenCount": prompt_tokens}
    if cand_tokens is not None:
        usage["candidatesTokenCount"] = cand_tokens
    if total_tokens is not None:
        usage["totalTokenCount"] = total_tokens
    body = {
        "candidates": [{
            "content": {"role": "model", "parts": [{"text": text}]},
            "finishReason": finish,
            "index": 0,
        }],
        "usageMetadata": usage,
    }
    return json.dumps(body).encode("utf-8")


def _http_err(code: int, *, headers: Optional[dict] = None,
              body: bytes = b"err") -> urllib.error.HTTPError:
    return urllib.error.HTTPError(
        url="https://generativelanguage.googleapis.com/x",
        code=code,
        msg=f"HTTP {code}",
        hdrs=headers or {},
        fp=None,
    )


# ---------------------------------------------------------------------------
# 1. Happy paths (image, video, audio)
# ---------------------------------------------------------------------------

def test_score_artifact_happy_path_image(small_png):
    """A 200 reply for an image yields a populated result after one call."""
    reply = _MockResponse(
        status=200,
        body=_ok_body(score=0.85),
        headers={"x-request-id": "req-1"},
    )
    fake_http = _MockTransport(reply)

    verdict = score_artifact(
        artifact_path=small_png,
        artifact_modality="image",
        prompt="Score this frame on production quality, 0-1.",
        judge_id="judge_image_v1",
        transport=fake_http,
    )

    assert isinstance(verdict, EvalProviderResult)
    assert verdict.score == 0.85
    assert "production-grade" in verdict.reasoning.lower()
    assert verdict.cost_usd > 0
    assert verdict.model_used == DEFAULT_MODEL_ID
    assert verdict.request_id == "req-1"

    meta = verdict.raw_metadata
    assert meta["judge_id"] == "judge_image_v1"
    assert meta["artifact_modality"] == "image"
    assert "warnings" in meta
    assert fake_http.call_count == 1


def test_score_artifact_happy_path_video(small_mp4):
    """A video artifact is scored and sent inline with the video/mp4 MIME type."""
    reply = _MockResponse(
        status=200, body=_ok_body(score=0.6, reasoning="OK pacing."),
    )
    fake_http = _MockTransport(reply)

    verdict = score_artifact(
        artifact_path=small_mp4,
        artifact_modality="video",
        prompt="Score this clip.",
        judge_id="judge_video_v1",
        transport=fake_http,
    )

    assert verdict.score == 0.6
    assert verdict.raw_metadata["artifact_modality"] == "video"

    # Inspect the request the provider actually put on the wire.
    request_json = json.loads(fake_http.calls[0]["body"].decode("utf-8"))
    media_part = request_json["contents"][0]["parts"][1]
    assert media_part["inlineData"]["mimeType"] == "video/mp4"


def test_score_artifact_happy_path_audio(small_mp3):
    """An audio artifact is scored and sent inline with an mp3 audio MIME type."""
    reply = _MockResponse(
        status=200, body=_ok_body(score=0.42, reasoning="Audio reads."),
    )
    fake_http = _MockTransport(reply)

    verdict = score_artifact(
        artifact_path=small_mp3,
        artifact_modality="audio",
        prompt="Score this clip.",
        judge_id="judge_audio_v1",
        transport=fake_http,
    )

    assert verdict.score == 0.42

    request_json = json.loads(fake_http.calls[0]["body"].decode("utf-8"))
    media_part = request_json["contents"][0]["parts"][1]
    # MIME registries differ on how they spell mp3 (audio/mpeg vs audio/mp3),
    # so either audio spelling is accepted.
    sent_mime = media_part["inlineData"]["mimeType"]
    assert sent_mime in ("audio/mpeg", "audio/mp3")


# ---------------------------------------------------------------------------
# 2. Pre-flight validation
# ---------------------------------------------------------------------------

def test_score_artifact_unsupported_modality(small_png):
    """Modality "text" is rejected with EvalPayloadError before any request."""
    fake_http = _MockTransport()
    call_kwargs = {
        "artifact_path": small_png,
        "artifact_modality": "text",
        "prompt": "x",
        "judge_id": "j",
        "transport": fake_http,
    }
    with pytest.raises(EvalPayloadError):
        score_artifact(**call_kwargs)
    assert fake_http.call_count == 0


def test_score_artifact_missing_api_key(small_png, monkeypatch):
    """With neither key env var set, EvalAuthError is raised before any request."""
    for env_name in ("GEMINI_API_KEY", "GOOGLE_API_KEY"):
        monkeypatch.delenv(env_name, raising=False)

    fake_http = _MockTransport()
    call_kwargs = {
        "artifact_path": small_png,
        "artifact_modality": "image",
        "prompt": "x",
        "judge_id": "j",
        "transport": fake_http,
    }
    with pytest.raises(EvalAuthError):
        score_artifact(**call_kwargs)
    assert fake_http.call_count == 0


def test_score_artifact_artifact_does_not_exist(tmp_path):
    """A path to a nonexistent file raises EvalPayloadError before any request."""
    missing_file = tmp_path / "nope.png"
    fake_http = _MockTransport()
    call_kwargs = {
        "artifact_path": missing_file,
        "artifact_modality": "image",
        "prompt": "x",
        "judge_id": "j",
        "transport": fake_http,
    }
    with pytest.raises(EvalPayloadError):
        score_artifact(**call_kwargs)
    assert fake_http.call_count == 0


def test_score_artifact_artifact_too_large(tmp_path, monkeypatch):
    """An oversized artifact is rejected with a message naming the Files API.

    The inline threshold is lowered to 1024 bytes so a 4096-byte file trips
    the check, rather than writing a genuinely huge file to disk.
    """
    monkeypatch.setattr(gv, "INLINE_DATA_RAW_FALLBACK_THRESHOLD", 1024)
    oversized = tmp_path / "huge.mp4"
    oversized.write_bytes(bytes(4096))

    fake_http = _MockTransport()
    call_kwargs = {
        "artifact_path": oversized,
        "artifact_modality": "video",
        "prompt": "x",
        "judge_id": "j",
        "transport": fake_http,
    }
    with pytest.raises(EvalPayloadError) as caught:
        score_artifact(**call_kwargs)

    assert "Files API" in str(caught.value)
    assert fake_http.call_count == 0


# ---------------------------------------------------------------------------
# 3. HTTP error mapping (fail-fast paths)
# ---------------------------------------------------------------------------

def test_score_artifact_400_payload(small_png):
    """HTTP 400 surfaces as EvalPayloadError after a single attempt."""
    fake_http = _MockTransport(_http_err(400))
    call_kwargs = {
        "artifact_path": small_png,
        "artifact_modality": "image",
        "prompt": "x",
        "judge_id": "j",
        "transport": fake_http,
    }
    with pytest.raises(EvalPayloadError):
        score_artifact(**call_kwargs)
    assert fake_http.call_count == 1


def test_score_artifact_401(small_png):
    """HTTP 401 surfaces as EvalAuthError after a single attempt."""
    fake_http = _MockTransport(_http_err(401))
    call_kwargs = {
        "artifact_path": small_png,
        "artifact_modality": "image",
        "prompt": "x",
        "judge_id": "j",
        "transport": fake_http,
    }
    with pytest.raises(EvalAuthError):
        score_artifact(**call_kwargs)
    assert fake_http.call_count == 1


def test_score_artifact_403(small_png):
    """HTTP 403 surfaces as EvalAuthError after a single attempt."""
    fake_http = _MockTransport(_http_err(403))
    call_kwargs = {
        "artifact_path": small_png,
        "artifact_modality": "image",
        "prompt": "x",
        "judge_id": "j",
        "transport": fake_http,
    }
    with pytest.raises(EvalAuthError):
        score_artifact(**call_kwargs)
    assert fake_http.call_count == 1


def test_score_artifact_404(small_png):
    """HTTP 404 surfaces as EvalPayloadError after a single attempt."""
    fake_http = _MockTransport(_http_err(404))
    call_kwargs = {
        "artifact_path": small_png,
        "artifact_modality": "image",
        "prompt": "x",
        "judge_id": "j",
        "transport": fake_http,
    }
    with pytest.raises(EvalPayloadError):
        score_artifact(**call_kwargs)
    assert fake_http.call_count == 1


def test_score_artifact_402_quota_reserved(small_png):
    """HTTP 402 surfaces as EvalQuotaError after a single attempt."""
    fake_http = _MockTransport(_http_err(402))
    call_kwargs = {
        "artifact_path": small_png,
        "artifact_modality": "image",
        "prompt": "x",
        "judge_id": "j",
        "transport": fake_http,
    }
    with pytest.raises(EvalQuotaError):
        score_artifact(**call_kwargs)
    assert fake_http.call_count == 1


# ---------------------------------------------------------------------------
# 4. Retry behavior — 429 / 500 / 503 / 504
# ---------------------------------------------------------------------------

def test_score_artifact_429_retried_then_succeeds(small_png):
    """Two 429s followed by a 200 succeed on the third transport call."""
    outcomes = [
        _http_err(429),
        _http_err(429),
        _MockResponse(status=200, body=_ok_body()),
    ]
    fake_http = _MockTransport(*outcomes)

    verdict = score_artifact(
        artifact_path=small_png,
        artifact_modality="image",
        prompt="x",
        judge_id="j",
        transport=fake_http,
    )

    assert verdict.score == 0.85
    # Two rate-limited attempts plus the successful one.
    assert fake_http.call_count == 3


def test_score_artifact_429_exhausts_then_raises(small_png):
    """Four consecutive 429s raise EvalRateLimitError after four calls."""
    # Three retries make the fourth call the final attempt, so queue four errors.
    fake_http = _MockTransport(*(_http_err(429) for _ in range(4)))
    call_kwargs = {
        "artifact_path": small_png,
        "artifact_modality": "image",
        "prompt": "x",
        "judge_id": "j",
        "transport": fake_http,
    }
    with pytest.raises(EvalRateLimitError):
        score_artifact(**call_kwargs)
    assert fake_http.call_count == 4


def test_score_artifact_500_retried_then_fails(small_png):
    """Four consecutive 500s raise EvalServerError after four calls."""
    fake_http = _MockTransport(*(_http_err(500) for _ in range(4)))
    call_kwargs = {
        "artifact_path": small_png,
        "artifact_modality": "image",
        "prompt": "x",
        "judge_id": "j",
        "transport": fake_http,
    }
    with pytest.raises(EvalServerError):
        score_artifact(**call_kwargs)
    assert fake_http.call_count == 4


def test_score_artifact_500_then_succeeds(small_png):
    """A single 500 followed by a 200 succeeds on the second call."""
    outcomes = [
        _http_err(500),
        _MockResponse(status=200, body=_ok_body()),
    ]
    fake_http = _MockTransport(*outcomes)

    verdict = score_artifact(
        artifact_path=small_png,
        artifact_modality="image",
        prompt="x",
        judge_id="j",
        transport=fake_http,
    )

    assert verdict.score == 0.85
    assert fake_http.call_count == 2


def test_score_artifact_503_uses_longer_backoff_schedule(small_png, monkeypatch):
    """Three 503s are retried using sleeps of 5, 10 and 20 seconds."""
    recorded_sleeps: list[float] = []
    # Capture the requested sleep durations instead of discarding them.
    monkeypatch.setattr(gv.time, "sleep", recorded_sleeps.append)

    outcomes = [_http_err(503) for _ in range(3)]
    outcomes.append(_MockResponse(status=200, body=_ok_body()))
    fake_http = _MockTransport(*outcomes)

    verdict = score_artifact(
        artifact_path=small_png,
        artifact_modality="image",
        prompt="x",
        judge_id="j",
        transport=fake_http,
    )

    assert verdict.score == 0.85
    assert fake_http.call_count == 4
    # Three retries mean three sleeps. Jitter is pinned to the midpoint of
    # [base*0.8, base*1.2], so each sleep equals its base value exactly.
    assert recorded_sleeps == [5.0, 10.0, 20.0]


def test_score_artifact_504_retries_only_2x(small_png, monkeypatch):
    """504 gets two retries, not three (audit § 12a item 7)."""
    recorded_sleeps: list[float] = []
    monkeypatch.setattr(gv.time, "sleep", recorded_sleeps.append)

    # Two retries allow three calls in total before the error is raised,
    # so three queued 504s are enough.
    fake_http = _MockTransport(*(_http_err(504) for _ in range(3)))
    call_kwargs = {
        "artifact_path": small_png,
        "artifact_modality": "image",
        "prompt": "x",
        "judge_id": "j",
        "transport": fake_http,
    }
    with pytest.raises(EvalServerError):
        score_artifact(**call_kwargs)

    assert fake_http.call_count == 3
    # One sleep per retry, taken from the 504 schedule.
    assert recorded_sleeps == [2.0, 4.0]


def test_score_artifact_retry_after_header_honored(small_png, monkeypatch):
    """A Retry-After of 7 seconds on a 429 is used as the sleep duration."""
    recorded_sleeps: list[float] = []
    monkeypatch.setattr(gv.time, "sleep", recorded_sleeps.append)

    throttled = _http_err(429, headers={"Retry-After": "7"})
    success = _MockResponse(status=200, body=_ok_body())
    fake_http = _MockTransport(throttled, success)

    verdict = score_artifact(
        artifact_path=small_png,
        artifact_modality="image",
        prompt="x",
        judge_id="j",
        transport=fake_http,
    )

    assert verdict.score == 0.85
    assert recorded_sleeps == [7.0]


def test_score_artifact_retry_after_capped_at_30s(small_png, monkeypatch):
    """A Retry-After of 120 seconds results in a 30-second sleep."""
    recorded_sleeps: list[float] = []
    monkeypatch.setattr(gv.time, "sleep", recorded_sleeps.append)

    throttled = _http_err(429, headers={"Retry-After": "120"})
    success = _MockResponse(status=200, body=_ok_body())
    fake_http = _MockTransport(throttled, success)

    score_artifact(
        artifact_path=small_png,
        artifact_modality="image",
        prompt="x",
        judge_id="j",
        transport=fake_http,
    )

    assert recorded_sleeps == [30.0]


# ---------------------------------------------------------------------------
# 5. Network errors (URLError / socket.timeout)
# ---------------------------------------------------------------------------

def test_score_artifact_network_blip_retries(small_png):
    """One URLError followed by a 200 succeeds on the second call."""
    outcomes = [
        urllib.error.URLError("conn reset"),
        _MockResponse(status=200, body=_ok_body()),
    ]
    fake_http = _MockTransport(*outcomes)

    verdict = score_artifact(
        artifact_path=small_png,
        artifact_modality="image",
        prompt="x",
        judge_id="j",
        transport=fake_http,
    )

    assert verdict.score == 0.85
    assert fake_http.call_count == 2


def test_score_artifact_network_blip_fails_after_retries(small_png):
    """Four consecutive URLErrors raise EvalNetworkError after four calls."""
    failures = (urllib.error.URLError(f"e{attempt}") for attempt in range(1, 5))
    fake_http = _MockTransport(*failures)
    call_kwargs = {
        "artifact_path": small_png,
        "artifact_modality": "image",
        "prompt": "x",
        "judge_id": "j",
        "transport": fake_http,
    }
    with pytest.raises(EvalNetworkError):
        score_artifact(**call_kwargs)
    assert fake_http.call_count == 4


def test_score_artifact_socket_timeout_retried(small_png):
    """One socket.timeout followed by a 200 succeeds on the second call."""
    outcomes = [
        socket.timeout("slow"),
        _MockResponse(status=200, body=_ok_body()),
    ]
    fake_http = _MockTransport(*outcomes)

    verdict = score_artifact(
        artifact_path=small_png,
        artifact_modality="image",
        prompt="x",
        judge_id="j",
        transport=fake_http,
    )

    assert verdict.score == 0.85
    assert fake_http.call_count == 2


def test_score_artifact_network_exhaustion_chains_original(small_png):
    """§ 12a item 10 — the raised EvalNetworkError has a URLError ``__cause__``."""
    dns_failure = urllib.error.URLError("dns kaput")
    # The same exception instance is queued for all four attempts.
    fake_http = _MockTransport(*([dns_failure] * 4))
    call_kwargs = {
        "artifact_path": small_png,
        "artifact_modality": "image",
        "prompt": "x",
        "judge_id": "j",
        "transport": fake_http,
    }
    with pytest.raises(EvalNetworkError) as caught:
        score_artifact(**call_kwargs)

    cause = caught.value.__cause__
    assert cause is not None
    assert isinstance(cause, urllib.error.URLError)


def test_score_artifact_http_exception_chains_original(small_png):
    """The typed error raised for a 401 keeps the HTTPError as ``__cause__``."""
    unauthorized = _http_err(401)
    fake_http = _MockTransport(unauthorized)
    call_kwargs = {
        "artifact_path": small_png,
        "artifact_modality": "image",
        "prompt": "x",
        "judge_id": "j",
        "transport": fake_http,
    }
    with pytest.raises(EvalAuthError) as caught:
        score_artifact(**call_kwargs)
    # Identity, not just type: the exact queued exception must be chained.
    assert caught.value.__cause__ is unauthorized


# ---------------------------------------------------------------------------
# 6. Response parsing failures
# ---------------------------------------------------------------------------

def test_score_artifact_response_not_json(small_png):
    """A 200 whose body is not JSON raises EvalPayloadError."""
    reply = _MockResponse(status=200, body=b"not JSON")
    call_kwargs = {
        "artifact_path": small_png,
        "artifact_modality": "image",
        "prompt": "x",
        "judge_id": "j",
        "transport": _MockTransport(reply),
    }
    with pytest.raises(EvalPayloadError):
        score_artifact(**call_kwargs)


def test_score_artifact_response_missing_candidates(small_png):
    """A 200 with an empty ``candidates`` list raises EvalPayloadError."""
    empty_payload = json.dumps({"candidates": []}).encode("utf-8")
    reply = _MockResponse(status=200, body=empty_payload)
    call_kwargs = {
        "artifact_path": small_png,
        "artifact_modality": "image",
        "prompt": "x",
        "judge_id": "j",
        "transport": _MockTransport(reply),
    }
    with pytest.raises(EvalPayloadError):
        score_artifact(**call_kwargs)


def test_score_artifact_verdict_text_not_json(small_png):
    """Candidate text that is not JSON raises EvalPayloadError."""
    reply = _MockResponse(
        status=200, body=_ok_body(raw_text_override="this is not json"),
    )
    call_kwargs = {
        "artifact_path": small_png,
        "artifact_modality": "image",
        "prompt": "x",
        "judge_id": "j",
        "transport": _MockTransport(reply),
    }
    with pytest.raises(EvalPayloadError):
        score_artifact(**call_kwargs)


def test_score_artifact_verdict_missing_score(small_png):
    """A verdict without a ``score`` key raises EvalPayloadError."""
    verdict_text = json.dumps({"reasoning": "..."})
    reply = _MockResponse(
        status=200, body=_ok_body(raw_text_override=verdict_text),
    )
    call_kwargs = {
        "artifact_path": small_png,
        "artifact_modality": "image",
        "prompt": "x",
        "judge_id": "j",
        "transport": _MockTransport(reply),
    }
    with pytest.raises(EvalPayloadError):
        score_artifact(**call_kwargs)


def test_score_artifact_verdict_missing_reasoning(small_png):
    """A verdict without a ``reasoning`` key raises EvalPayloadError."""
    verdict_text = json.dumps({"score": 0.5})
    reply = _MockResponse(
        status=200, body=_ok_body(raw_text_override=verdict_text),
    )
    call_kwargs = {
        "artifact_path": small_png,
        "artifact_modality": "image",
        "prompt": "x",
        "judge_id": "j",
        "transport": _MockTransport(reply),
    }
    with pytest.raises(EvalPayloadError):
        score_artifact(**call_kwargs)


def test_score_artifact_score_clipped_high(small_png):
    """A score of 1.4 is clipped to 1.0; the original value and a warning are kept."""
    reply = _MockResponse(
        status=200, body=_ok_body(score=1.4, reasoning="too high"),
    )
    verdict = score_artifact(
        artifact_path=small_png,
        artifact_modality="image",
        prompt="x",
        judge_id="j",
        transport=_MockTransport(reply),
    )

    assert verdict.score == 1.0
    meta = verdict.raw_metadata
    assert meta["score_clipped"]["original"] == 1.4
    assert "score_clipped" in meta["warnings"]


def test_score_artifact_score_clipped_low(small_png):
    """A score of -0.2 is clipped to 0.0; the original value and a warning are kept."""
    body = _ok_body(score=-0.2, reasoning="too low")
    transport = _MockTransport(_MockResponse(status=200, body=body))
    result = score_artifact(
        artifact_path=small_png, artifact_modality="image",
        prompt="x", judge_id="j", transport=transport,
    )
    assert result.score == 0.0
    assert result.raw_metadata["score_clipped"]["original"] == -0.2
    # Mirror the high-side clipping test: the warning must be recorded for
    # low-side clipping as well, not only the original value.
    assert "score_clipped" in result.raw_metadata["warnings"]


def test_score_artifact_code_fences_stripped(small_png):
    """A verdict wrapped in a Markdown json code fence still parses."""
    fenced_payload = _ok_body(score=0.55, reasoning="fenced.", fence_wrapped=True)
    reply = _MockResponse(status=200, body=fenced_payload)
    verdict = score_artifact(
        artifact_path=small_png,
        artifact_modality="image",
        prompt="x",
        judge_id="j",
        transport=_MockTransport(reply),
    )
    assert verdict.score == 0.55


# ---------------------------------------------------------------------------
# 7. MAX_TOKENS handling — § 12a item 6
# ---------------------------------------------------------------------------

def test_score_artifact_max_tokens_clean_parse_warns(small_png):
    """finishReason MAX_TOKENS with a parseable verdict succeeds with a warning."""
    payload = _ok_body(score=0.7, reasoning="just made it.", finish="MAX_TOKENS")
    reply = _MockResponse(status=200, body=payload)
    verdict = score_artifact(
        artifact_path=small_png,
        artifact_modality="image",
        prompt="x",
        judge_id="j",
        transport=_MockTransport(reply),
    )

    assert verdict.score == 0.7
    assert "truncated_max_tokens" in verdict.raw_metadata["warnings"]


def test_score_artifact_max_tokens_unparseable_raises(small_png):
    """MAX_TOKENS with cut-off JSON raises EvalPayloadError naming the cause."""
    cut_off_json = '{"score": 0.7, "reasoning": "trunc'
    payload = _ok_body(raw_text_override=cut_off_json, finish="MAX_TOKENS")
    reply = _MockResponse(status=200, body=payload)
    call_kwargs = {
        "artifact_path": small_png,
        "artifact_modality": "image",
        "prompt": "x",
        "judge_id": "j",
        "transport": _MockTransport(reply),
    }
    with pytest.raises(EvalPayloadError) as caught:
        score_artifact(**call_kwargs)
    assert "truncated_unparseable" in str(caught.value)


def test_score_artifact_safety_finish_reason_raises(small_png):
    """finishReason SAFETY raises EvalServerError."""
    reply = _MockResponse(status=200, body=_ok_body(finish="SAFETY"))
    call_kwargs = {
        "artifact_path": small_png,
        "artifact_modality": "image",
        "prompt": "x",
        "judge_id": "j",
        "transport": _MockTransport(reply),
    }
    with pytest.raises(EvalServerError):
        score_artifact(**call_kwargs)


# ---------------------------------------------------------------------------
# 8. Token / cost compute — fallbacks
# ---------------------------------------------------------------------------

def test_score_artifact_missing_candidates_token_count_falls_back(small_png):
    """Without candidatesTokenCount, output tokens come from totalTokenCount."""
    payload = _ok_body(prompt_tokens=200, cand_tokens=None, total_tokens=350)
    reply = _MockResponse(status=200, body=payload)
    verdict = score_artifact(
        artifact_path=small_png,
        artifact_modality="image",
        prompt="x",
        judge_id="j",
        transport=_MockTransport(reply),
    )

    # Output tokens are derived as total - prompt = 350 - 200 = 150.
    input_cost = (200 / 1000) * 0.002
    output_cost = (150 / 1000) * 0.012
    assert verdict.cost_usd == pytest.approx(input_cost + output_cost)
    assert "missing_token_count" not in verdict.raw_metadata["warnings"]


def test_score_artifact_missing_both_token_counts_warns_zero_output(small_png):
    """With neither output count present, output cost is zero and a warning is set."""
    payload = _ok_body(prompt_tokens=200, cand_tokens=None, total_tokens=None)
    reply = _MockResponse(status=200, body=payload)
    verdict = score_artifact(
        artifact_path=small_png,
        artifact_modality="image",
        prompt="x",
        judge_id="j",
        transport=_MockTransport(reply),
    )

    # Only the input side contributes: output tokens fell back to 0.
    input_only_cost = (200 / 1000) * 0.002
    assert verdict.cost_usd == pytest.approx(input_only_cost)
    assert "missing_token_count" in verdict.raw_metadata["warnings"]


def test_score_artifact_long_context_bucket(small_png):
    """promptTokenCount > 200_000 switches to long-context rates.

    The rates and the 200_000-token threshold come from the autouse
    model-profile stub, so this test needs no patching of its own (the
    previously requested ``monkeypatch`` fixture was unused and is dropped).
    """
    body = _ok_body(prompt_tokens=300_000, cand_tokens=100)
    transport = _MockTransport(_MockResponse(status=200, body=body))
    result = score_artifact(
        artifact_path=small_png, artifact_modality="image",
        prompt="x", judge_id="j", transport=transport,
    )
    # Both input and output are billed at the long-context rates.
    expected_cost = (300_000 / 1000) * 0.004 + (100 / 1000) * 0.018
    assert result.cost_usd == pytest.approx(expected_cost)


def test_score_artifact_cost_zero_when_profile_missing(small_png, monkeypatch):
    """When the profile lookup raises KeyError, scoring still works with cost 0.

    Covers the case where Phase 3 has not written the profile yet: Phase 2
    must not crash.
    """
    import sys
    import types

    def fake_get_profile(model_id):
        raise KeyError(model_id)

    # Replace the default stub with one that knows no models at all.
    empty_profiles = types.ModuleType("recoil.core.model_profiles")
    empty_profiles.get_profile = fake_get_profile
    monkeypatch.setitem(sys.modules, "recoil.core.model_profiles", empty_profiles)

    reply = _MockResponse(status=200, body=_ok_body())
    verdict = score_artifact(
        artifact_path=small_png,
        artifact_modality="image",
        prompt="x",
        judge_id="j",
        transport=_MockTransport(reply),
    )
    assert verdict.cost_usd == 0.0


# ---------------------------------------------------------------------------
# 9. Auth precedence — § 12a item 11
# ---------------------------------------------------------------------------

def test_auth_uses_gemini_api_key_over_google_api_key(small_png, monkeypatch):
    """With both env vars set, the GEMINI_API_KEY value is sent in the header."""
    for env_name, env_value in (
        ("GEMINI_API_KEY", "gemini-key"),
        ("GOOGLE_API_KEY", "google-key"),
    ):
        monkeypatch.setenv(env_name, env_value)

    fake_http = _MockTransport(_MockResponse(status=200, body=_ok_body()))
    score_artifact(
        artifact_path=small_png,
        artifact_modality="image",
        prompt="x",
        judge_id="j",
        transport=fake_http,
    )

    request_headers = fake_http.calls[0]["headers"]
    assert request_headers["x-goog-api-key"] == "gemini-key"


def test_auth_falls_back_to_google_api_key(small_png, monkeypatch):
    """With GEMINI_API_KEY unset, the GOOGLE_API_KEY value is sent instead."""
    monkeypatch.delenv("GEMINI_API_KEY", raising=False)
    monkeypatch.setenv("GOOGLE_API_KEY", "google-fallback")

    fake_http = _MockTransport(_MockResponse(status=200, body=_ok_body()))
    score_artifact(
        artifact_path=small_png,
        artifact_modality="image",
        prompt="x",
        judge_id="j",
        transport=fake_http,
    )

    request_headers = fake_http.calls[0]["headers"]
    assert request_headers["x-goog-api-key"] == "google-fallback"


def test_auth_custom_env_var_does_not_consult_google_key(small_png, monkeypatch):
    """A custom ``api_key_env_var`` that is unset fails auth outright.

    GOOGLE_API_KEY is set but must not be used as a fallback when the caller
    names a custom variable; EvalAuthError is expected before any request.
    """
    monkeypatch.delenv("GEMINI_API_KEY", raising=False)
    # Guard against a developer/CI environment that happens to define the
    # custom variable, which would make the expected auth failure vanish.
    monkeypatch.delenv("MY_CUSTOM_KEY", raising=False)
    monkeypatch.setenv("GOOGLE_API_KEY", "should-not-be-used")
    transport = _MockTransport()
    with pytest.raises(EvalAuthError):
        score_artifact(
            artifact_path=small_png, artifact_modality="image",
            prompt="x", judge_id="j",
            api_key_env_var="MY_CUSTOM_KEY",
            transport=transport,
        )
    # Same pre-flight guarantee as the missing-key test: nothing was sent.
    assert transport.call_count == 0


# ---------------------------------------------------------------------------
# 10. Request body shape — § 12a items 2, 12
# ---------------------------------------------------------------------------

def test_request_body_uses_camelcase_keys(small_png):
    """The request body uses camelCase keys and none of their snake_case twins."""
    transport = _MockTransport(_MockResponse(status=200, body=_ok_body()))
    score_artifact(
        artifact_path=small_png, artifact_modality="image",
        prompt="rubric", judge_id="j", transport=transport,
    )
    sent = json.loads(transport.calls[0]["body"].decode("utf-8"))
    assert "generationConfig" in sent
    # Check the real snake_case spelling; the previous placeholder key
    # ("snake_case_no") could never appear, so the assertion was vacuous.
    assert "generation_config" not in sent
    assert sent["generationConfig"]["maxOutputTokens"] == 1024
    assert sent["generationConfig"]["responseMimeType"] == "application/json"
    parts = sent["contents"][0]["parts"]
    assert parts[1].get("inlineData") is not None
    assert "inline_data" not in parts[1]
    assert "mimeType" in parts[1]["inlineData"]
    assert "mime_type" not in parts[1]["inlineData"]


def test_request_body_uses_standard_base64_not_urlsafe(small_png):
    """§ 12a item 12 — inline data is standard base64, not the urlsafe variant.

    Checked by decoding the sent data with ``standard_b64decode`` and
    comparing it to the artifact's bytes.
    """
    fake_http = _MockTransport(_MockResponse(status=200, body=_ok_body()))
    score_artifact(
        artifact_path=small_png,
        artifact_modality="image",
        prompt="x",
        judge_id="j",
        transport=fake_http,
    )

    request_json = json.loads(fake_http.calls[0]["body"].decode("utf-8"))
    encoded = request_json["contents"][0]["parts"][1]["inlineData"]["data"]

    # The urlsafe alphabet swaps "+" and "/" for "-" and "_".
    for urlsafe_char in "-_":
        assert urlsafe_char not in encoded
    round_tripped = base64.standard_b64decode(encoded)
    assert round_tripped == small_png.read_bytes()


def test_request_url_uses_default_model_id(small_png):
    """With no model override, the URL targets DEFAULT_MODEL_ID's generateContent."""
    transport = _MockTransport(_MockResponse(status=200, body=_ok_body()))
    score_artifact(
        artifact_path=small_png, artifact_modality="image",
        prompt="x", judge_id="j", transport=transport,
    )
    sent_url = transport.calls[0]["url"]
    # Compare against the constant rather than repeating its literal value;
    # the literal is pinned once, in test_default_model_id_is_preview_suffix.
    assert DEFAULT_MODEL_ID in sent_url
    assert ":generateContent" in sent_url


def test_default_model_id_is_preview_suffix():
    """Pin the default model id to its ``-preview`` spelling.

    Per § 12a item 1, the bare 'gemini-3.1-pro' would 404.
    """
    expected_model_id = "gemini-3.1-pro-preview"
    assert DEFAULT_MODEL_ID == expected_model_id


# ---------------------------------------------------------------------------
# 11. judge_id thread-through
# ---------------------------------------------------------------------------

def test_judge_id_threads_through_raw_metadata(small_png):
    """The caller's judge_id is echoed back in ``raw_metadata``."""
    reply = _MockResponse(status=200, body=_ok_body())
    verdict = score_artifact(
        artifact_path=small_png,
        artifact_modality="image",
        prompt="x",
        judge_id="my_judge_xyz",
        transport=_MockTransport(reply),
    )
    assert verdict.raw_metadata["judge_id"] == "my_judge_xyz"


# ---------------------------------------------------------------------------
# 12. Files-API helper (Phase 2 ships sketch only)
# ---------------------------------------------------------------------------

def test_build_part_for_file_uri_uses_name_field_shape():
    """§ 12a item 13 — ``fileUri`` carries the 'name' field ('files/abc-123')."""
    expected_part = {
        "fileData": {"mimeType": "video/mp4", "fileUri": "files/abc-123"},
    }
    built_part = gv._build_part_for_file_uri("files/abc-123", "video/mp4")
    assert built_part == expected_part


# ---------------------------------------------------------------------------
# 13. Public surface assertions
# ---------------------------------------------------------------------------

def test_public_surface_exports():
    """Check the module's public names, result fields and exception tree."""
    assert hasattr(gv, "score_artifact")

    required_fields = {
        "score", "reasoning", "cost_usd", "model_used", "request_id",
        "raw_metadata",
    }
    assert required_fields.issubset(EvalProviderResult.__dataclass_fields__)

    # Every typed error must derive from the common provider base class.
    typed_errors = (
        EvalAuthError,
        EvalQuotaError,
        EvalPayloadError,
        EvalRateLimitError,
        EvalServerError,
        EvalNetworkError,
    )
    for error_type in typed_errors:
        assert issubclass(error_type, EvalProviderError)

    assert PRIMARY_AUTH_ENV_VAR == "GEMINI_API_KEY"