"""CP-9 Phase 3 — attach_eval_hooks utility.

Coverage:
  - Returns a (pre_step, post_step, on_failure) triple of callables.
  - pre_step is a no-op.
  - post_step writes scorecard into receipt.eval_scores[panel_id] when
    step succeeded with output_path; in-place mutation respects the
    frozen-receipt + mutable-dict-field rule (audit § 12e).
  - post_step skips eval modalities (no recursion) and unknown modalities.
  - post_step skips when no output_path or no receipt.
  - post_step swallows panel exceptions (NEVER raises).
  - on_failure runs eval when eval_on_failure=True and output present;
    skipped when False or no output_path.
  - Cumulative eval_cost_usd accumulates into receipt.provenance.

All panels are stub fakes — no live API calls.
"""

import sys
import pathlib
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Optional
from unittest.mock import MagicMock

import pytest

sys.path.insert(0, str(pathlib.Path(__file__).resolve().parent.parent.parent.parent))
from recoil.core.paths import ensure_pipeline_importable  # noqa: E402

ensure_pipeline_importable()

from recoil.pipeline.core.eval import (  # noqa: E402
    EvalContext, attach_eval_hooks,
)


# ── Test doubles ────────────────────────────────────────────────────

class _FakePanel:
    """Stub panel that returns a canned scorecard and records every call.

    Args:
        panel_id: Identifier exposed as ``panel_id`` and embedded in the
            default scorecard.
        scorecard: Dict returned by ``score``. When ``None`` a default
            scorecard is built (``panel_score`` 0.75, ``panel_cost_usd``
            0.05). An explicitly supplied dict — including an empty one —
            is returned as-is.
        raise_exc: When given, ``score`` raises it after recording the call.

    Attributes:
        calls: ``(receipt, context)`` pairs, one per ``score`` invocation,
            in call order.
    """

    def __init__(
        self, panel_id: str = "fake_panel",
        scorecard: Optional[dict] = None,
        raise_exc: Optional[Exception] = None,
    ) -> None:
        self.panel_id = panel_id
        # Compare against None instead of relying on truthiness so a caller
        # can deliberately configure an empty scorecard.
        if scorecard is None:
            scorecard = {
                "panel_id": panel_id,
                "panel_score": 0.75,
                "panel_warnings": [],
                "judges": [],
                "aggregation": "median",
                "panel_cost_usd": 0.05,
            }
        self._scorecard = scorecard
        self._raise_exc = raise_exc
        self.calls: list[tuple[Any, "EvalContext"]] = []

    def score(self, receipt: Any, context: "EvalContext") -> dict:
        """Record the call, then raise the configured error or return the scorecard.

        The call is recorded before raising so tests can verify the panel
        was reached even on the failure path.
        """
        self.calls.append((receipt, context))
        if self._raise_exc is not None:
            raise self._raise_exc
        return self._scorecard


@dataclass
class _FakeRunResult:
    """Stub run result exposing only ``output_path``."""

    # Location of the produced artifact; None models "nothing was produced".
    output_path: Optional[str] = field(default=None)


@dataclass
class _FakeReceipt:
    """Minimal receipt double for the attributes the hooks read and write.

    The real GenerationReceipt is frozen while its eval_scores and
    provenance fields are mutable dicts (audit § 12e). These tests only
    check that those dicts are mutated in place, so an ordinary
    (non-frozen) dataclass is an adequate stand-in.
    """

    # Result of the generation run; None models a receipt without one.
    run_result: Optional[_FakeRunResult] = field(default=None)
    # Per-panel scorecards, keyed by panel id in the tests below.
    eval_scores: dict[str, Any] = field(default_factory=dict)
    # Free-form provenance; the tests read/write "eval_cost_usd" here.
    provenance: dict[str, Any] = field(default_factory=dict)


@dataclass
class _FakeStep:
    """Minimal workflow-step double handed to the hooks under test."""

    step_id: str = "step1"
    # Modality name; the tests vary this to exercise routing and skips.
    modality: str = "image_t2i"
    # Fresh dict per instance so tests never share payload state.
    payload: dict = field(default_factory=lambda: dict(prompt="p"))
    # None models a step that never produced a receipt.
    receipt: Optional[_FakeReceipt] = field(default=None)
    status: str = "succeeded"


def _fake_workflow() -> Any:
    """Build a fresh mock named "workflow" to stand in for a real workflow.

    The hooks are not expected to use the workflow, so an opaque sentinel
    is sufficient.
    """
    sentinel = MagicMock(name="workflow")
    return sentinel


def _make_succeeded_step(
    *, modality: str = "image_t2i", output_path: str = "/tmp/img.png",
) -> _FakeStep:
    """Build a step in status "succeeded" whose receipt has an output path.

    Args:
        modality: Modality name stored on the step.
        output_path: Path stored on the receipt's run result.
    """
    run_result = _FakeRunResult(output_path=output_path)
    receipt = _FakeReceipt(run_result=run_result)
    return _FakeStep(modality=modality, receipt=receipt, status="succeeded")


def _make_panel(panel_id: str = "fake_panel") -> _FakePanel:
    """Return a ``_FakePanel`` with the given id and its default scorecard."""
    panel = _FakePanel(panel_id=panel_id)
    return panel


# ── Triple shape ────────────────────────────────────────────────────

def test_attach_eval_hooks_returns_three_callables() -> None:
    """attach_eval_hooks yields exactly three hooks and each is callable."""
    fake_panel = _make_panel()
    hooks = attach_eval_hooks(_fake_workflow(), fake_panel)  # type: ignore[arg-type]
    # Unpacking enforces "exactly three" before checking callability.
    pre_hook, post_hook, failure_hook = hooks
    for hook in (pre_hook, post_hook, failure_hook):
        assert callable(hook)


def test_pre_step_is_noop() -> None:
    """pre_step returns None and leaves both the receipt and the panel untouched."""
    panel = _make_panel()
    pre, _, _ = attach_eval_hooks(_fake_workflow(), panel)  # type: ignore[arg-type]
    step = _make_succeeded_step()
    assert pre(step, _fake_workflow()) is None
    assert step.receipt.eval_scores == {}  # pre_step did nothing
    # A true no-op must not score (and so incur eval cost) or touch provenance.
    assert panel.calls == []
    assert step.receipt.provenance == {}


# ── post_step happy paths ───────────────────────────────────────────

def test_post_step_writes_eval_scores_on_success() -> None:
    """A succeeded step with an output path gets the panel scorecard stored."""
    panel_id = "p_writes"
    panel = _make_panel(panel_id)
    _pre, post_hook, _fail = attach_eval_hooks(_fake_workflow(), panel)  # type: ignore[arg-type]
    step = _make_succeeded_step()

    post_hook(step, _fake_workflow())

    # Read the scores after the hook ran, via the step itself.
    scores = step.receipt.eval_scores
    assert panel_id in scores
    assert scores[panel_id]["panel_score"] == 0.75
    assert len(panel.calls) == 1


def test_post_step_passes_eval_context_with_correct_fields() -> None:
    """The panel receives the step's receipt plus a populated EvalContext."""
    artifact = "/tmp/foo.png"
    panel = _make_panel()
    _pre, post_hook, _fail = attach_eval_hooks(_fake_workflow(), panel)  # type: ignore[arg-type]
    step = _make_succeeded_step(modality="image_t2i", output_path=artifact)

    post_hook(step, _fake_workflow())

    seen_receipt, seen_ctx = panel.calls[0]
    assert seen_receipt is step.receipt
    assert isinstance(seen_ctx, EvalContext)
    assert seen_ctx.target_artifact_path == Path(artifact)
    assert seen_ctx.rubric  # a non-empty default rubric is supplied
    assert seen_ctx.judge_id == panel.panel_id
    assert seen_ctx.prompt == "p"  # taken from the step payload


def test_post_step_routes_lipsync_to_video_rubric() -> None:
    """A lipsync_post step is scored with the rubric registered under "video"."""
    rubrics = dict(image="rub_image", video="rub_video", audio="rub_audio")
    panel = _make_panel()
    _pre, post_hook, _fail = attach_eval_hooks(  # type: ignore[arg-type]
        _fake_workflow(), panel, rubric_per_modality=rubrics,
    )
    lipsync_step = _make_succeeded_step(
        modality="lipsync_post", output_path="/tmp/x.mp4",
    )

    post_hook(lipsync_step, _fake_workflow())

    _receipt, seen_ctx = panel.calls[0]
    assert seen_ctx.rubric == "rub_video"


def test_post_step_audio_modality_uses_audio_rubric() -> None:
    """An audio_t2a step is scored with the rubric registered under "audio"."""
    rubrics = dict(image="ri", video="rv", audio="ra")
    panel = _make_panel()
    _pre, post_hook, _fail = attach_eval_hooks(  # type: ignore[arg-type]
        _fake_workflow(), panel, rubric_per_modality=rubrics,
    )
    audio_step = _make_succeeded_step(
        modality="audio_t2a", output_path="/tmp/x.mp3",
    )

    post_hook(audio_step, _fake_workflow())

    _receipt, seen_ctx = panel.calls[0]
    assert seen_ctx.rubric == "ra"


# ── post_step skip paths ────────────────────────────────────────────

def test_post_step_skips_eval_modality() -> None:
    """eval_image_v1 / eval_video_v1 / eval_audio_v1 must NOT recurse.

    Every eval modality named above is exercised, not just the image one,
    so a regression in any single branch is caught.
    """
    panel = _make_panel()
    _, post, _ = attach_eval_hooks(_fake_workflow(), panel)  # type: ignore[arg-type]
    for modality, output in (
        ("eval_image_v1", "/tmp/x.png"),
        ("eval_video_v1", "/tmp/x.mp4"),
        ("eval_audio_v1", "/tmp/x.mp3"),
    ):
        step = _make_succeeded_step(modality=modality, output_path=output)
        post(step, _fake_workflow())
        assert panel.calls == [], f"modality={modality} invoked panel"
        assert step.receipt.eval_scores == {}, (
            f"modality={modality} wrote eval_scores"
        )


def test_post_step_skips_unknown_modality() -> None:
    """A modality the hooks do not recognise is neither scored nor recorded."""
    panel = _make_panel()
    _pre, post_hook, _fail = attach_eval_hooks(_fake_workflow(), panel)  # type: ignore[arg-type]
    odd_step = _make_succeeded_step(modality="some_unknown_modality")

    post_hook(odd_step, _fake_workflow())

    assert panel.calls == []
    assert odd_step.receipt.eval_scores == {}


def test_post_step_skips_when_no_output_path() -> None:
    """A succeeded step whose run result has no output path is not scored."""
    panel = _make_panel()
    _pre, post_hook, _fail = attach_eval_hooks(_fake_workflow(), panel)  # type: ignore[arg-type]
    receipt_without_output = _FakeReceipt(
        run_result=_FakeRunResult(output_path=None),
    )
    step = _FakeStep(
        modality="image_t2i",
        receipt=receipt_without_output,
        status="succeeded",
    )

    post_hook(step, _fake_workflow())

    assert panel.calls == []


def test_post_step_handles_missing_receipt() -> None:
    """A step with no receipt at all is tolerated: no raise, no scoring."""
    panel = _make_panel()
    _pre, post_hook, _fail = attach_eval_hooks(_fake_workflow(), panel)  # type: ignore[arg-type]
    receiptless_step = _FakeStep(
        modality="image_t2i",
        receipt=None,
        status="succeeded",
    )

    # The call itself is the assertion: it must complete without raising.
    post_hook(receiptless_step, _fake_workflow())

    assert panel.calls == []


def test_post_step_skips_when_step_not_succeeded() -> None:
    """post_step does not score a step whose status is "failed"."""
    panel = _make_panel()
    _pre, post_hook, _fail = attach_eval_hooks(_fake_workflow(), panel)  # type: ignore[arg-type]
    failed_step = _make_succeeded_step()
    # Flip the status after construction; the output path stays in place.
    failed_step.status = "failed"

    post_hook(failed_step, _fake_workflow())

    assert panel.calls == []


# ── post_step swallows exceptions ───────────────────────────────────

def test_post_step_swallows_panel_exception() -> None:
    """A panel that raises must not propagate out of post_step.

    The panel is also asserted to have been invoked; without that check the
    test would pass vacuously if the hook skipped the step for an unrelated
    reason and never reached the raising code path.
    """
    panel = _FakePanel(raise_exc=RuntimeError("simulated panel blowup"))
    _, post, _ = attach_eval_hooks(_fake_workflow(), panel)  # type: ignore[arg-type]
    step = _make_succeeded_step()
    # Must NOT raise.
    post(step, _fake_workflow())
    # _FakePanel records the call before raising, so a non-empty list proves
    # the exception was actually triggered and swallowed.
    assert panel.calls, "panel was never invoked; exception path not exercised"
    assert step.receipt.eval_scores == {}


def test_post_step_swallows_context_construction_exception() -> None:
    """post_step must not raise when step.modality is None instead of a str.

    The only expectation checked here is the absence of an exception; the
    hook is presumably protected by guards inside the eval module (not
    visible from this file) so a malformed step cannot crash the executor.
    """
    panel = _make_panel()
    _pre, post_hook, _fail = attach_eval_hooks(_fake_workflow(), panel)  # type: ignore[arg-type]
    malformed_step = _make_succeeded_step()
    # Deliberately the wrong type for the modality attribute.
    malformed_step.modality = None  # type: ignore[assignment]

    post_hook(malformed_step, _fake_workflow())  # must NOT raise


# ── on_failure ──────────────────────────────────────────────────────

def test_on_failure_runs_eval_when_eval_on_failure_True() -> None:
    """With eval_on_failure=True a failed step that has output is scored."""
    panel_id = "p_fail"
    panel = _make_panel(panel_id)
    _pre, _post, failure_hook = attach_eval_hooks(  # type: ignore[arg-type]
        _fake_workflow(), panel, eval_on_failure=True,
    )
    failed_step = _make_succeeded_step()
    failed_step.status = "failed"

    failure_hook(failed_step, _fake_workflow())

    assert panel_id in failed_step.receipt.eval_scores
    assert len(panel.calls) == 1
    _receipt, seen_ctx = panel.calls[0]
    # The context metadata records why the eval ran.
    assert seen_ctx.metadata.get("reason") == "step failed"


def test_on_failure_skipped_when_eval_on_failure_False() -> None:
    """With eval_on_failure=False the failure hook never calls the panel."""
    panel = _make_panel()
    _pre, _post, failure_hook = attach_eval_hooks(  # type: ignore[arg-type]
        _fake_workflow(), panel, eval_on_failure=False,
    )
    failed_step = _make_succeeded_step()
    failed_step.status = "failed"

    failure_hook(failed_step, _fake_workflow())

    assert panel.calls == []


def test_on_failure_skipped_when_no_output_path() -> None:
    """Even with eval_on_failure=True, a failed step without output is skipped."""
    panel = _make_panel()
    _pre, _post, failure_hook = attach_eval_hooks(  # type: ignore[arg-type]
        _fake_workflow(), panel, eval_on_failure=True,
    )
    receipt_without_output = _FakeReceipt(
        run_result=_FakeRunResult(output_path=None),
    )
    failed_step = _FakeStep(
        modality="image_t2i",
        receipt=receipt_without_output,
        status="failed",
    )

    failure_hook(failed_step, _fake_workflow())

    assert panel.calls == []


# ── Provenance accumulation ─────────────────────────────────────────

def test_post_step_eval_cost_accumulates_in_provenance() -> None:
    """Each eval adds the panel cost (0.05) to provenance["eval_cost_usd"]."""
    panel = _make_panel("p_cost")
    _pre, post_hook, _fail = attach_eval_hooks(_fake_workflow(), panel)  # type: ignore[arg-type]
    step = _make_succeeded_step()

    post_hook(step, _fake_workflow())
    after_first = step.receipt.provenance["eval_cost_usd"]
    assert after_first == pytest.approx(0.05)

    # Re-running against the SAME receipt must add to the total, not reset it.
    post_hook(step, _fake_workflow())
    after_second = step.receipt.provenance["eval_cost_usd"]
    assert after_second == pytest.approx(0.10)


def test_post_step_eval_cost_starts_from_existing_provenance_value() -> None:
    """A pre-existing eval_cost_usd in provenance is added to, not overwritten."""
    panel = _make_panel("p_cost2")
    _pre, post_hook, _fail = attach_eval_hooks(_fake_workflow(), panel)  # type: ignore[arg-type]
    step = _make_succeeded_step()
    # Seed a prior cost so the hook has something to accumulate onto.
    step.receipt.provenance.update(eval_cost_usd=0.20)

    post_hook(step, _fake_workflow())

    total = step.receipt.provenance["eval_cost_usd"]
    assert total == pytest.approx(0.25)


# ── Default rubrics ─────────────────────────────────────────────────

def test_default_rubrics_present_for_image_video_audio() -> None:
    """Without custom rubrics, each supported modality still gets a non-empty one."""
    panel = _make_panel()
    _pre, post_hook, _fail = attach_eval_hooks(_fake_workflow(), panel)  # type: ignore[arg-type]
    cases = (
        ("image_t2i", "/tmp/x.png"),
        ("video_i2v", "/tmp/x.mp4"),
        ("audio_t2a", "/tmp/x.mp3"),
        ("lipsync_post", "/tmp/x.mp4"),
    )
    for modality, artifact in cases:
        # The panel is shared across iterations; reset its call log each time.
        panel.calls.clear()
        step = _make_succeeded_step(modality=modality, output_path=artifact)

        post_hook(step, _fake_workflow())

        assert len(panel.calls) == 1, f"modality={modality} did not invoke panel"
        _receipt, seen_ctx = panel.calls[0]
        assert seen_ctx.rubric  # a non-empty default must be present