"""CP-9 Phase 6 integration test 3 — on_failure hook ALSO runs eval.

Per § 12e R4, the on_failure hook returned by attach_eval_hooks runs eval
for diagnostic logging on FAILED steps (default eval_on_failure=True). The
diagnostic scorecard carries an EvalContext.metadata['reason'] indicator so
downstream consumers can distinguish "eval of a successful artifact" from
"eval of an artifact whose step failed."

This costs roughly 2× the eval bill on failed steps; flagged for spec review
but Phase 6 ships with default = ON.

Mocking is limited to the EvalNode boundary and the StepRunner boundary.
Workflow / dispatch / attach_eval_hooks / PanelOfJudges are real.
"""

import sys
import pathlib
from types import SimpleNamespace

sys.path.insert(0, str(pathlib.Path(__file__).resolve().parent.parent.parent.parent))
from recoil.core.paths import ensure_pipeline_importable  # noqa: E402

ensure_pipeline_importable()

import pytest  # noqa: E402

from recoil.pipeline.core.dispatch import _reset_bootstrap_for_tests  # noqa: E402
from recoil.pipeline.core.dispatch_context import DispatchContext  # noqa: E402
from recoil.pipeline.core.eval import (  # noqa: E402
    PanelOfJudges,
    attach_eval_hooks,
    _reset_eval_registry_for_tests,
)
from recoil.pipeline.core.registry import _reset_for_tests  # noqa: E402
from recoil.pipeline.core.workflow import Workflow, WorkflowStep  # noqa: E402

from recoil.pipeline.core.tests._eval_test_helpers import FakeEvalNode  # noqa: E402


@pytest.fixture(autouse=True)
def reset_registries():
    """Run the three ``_reset_*_for_tests`` helpers around every test.

    The fixture is autouse, so tests in this module do not have to request
    it. The same helpers are invoked, in the same order, once before the
    test body (setup) and once after it (teardown).
    """
    resetters = (
        _reset_for_tests,
        _reset_bootstrap_for_tests,
        _reset_eval_registry_for_tests,
    )

    # Setup: start each test from freshly reset state.
    for reset in resetters:
        reset()

    yield

    # Teardown: undo whatever the test registered.
    for reset in resetters:
        reset()


class _FailingButProducesArtifactStepRunner:
    """Fake StepRunner whose keyframe call always fails yet leaves an artifact.

    ``execute_keyframe`` returns a result with ``success=False`` together
    with a non-None ``output_path``. That combination is what lets the
    on_failure eval hook be exercised with something to score — the
    "rejected take with a render kept on disk for forensics" pattern.

    Every keyframe invocation is recorded in ``calls`` as a
    ``("keyframe", kwargs_copy)`` tuple so tests can inspect what was
    dispatched.
    """

    def __init__(self) -> None:
        # Call log; each entry is (kind, copy of the kwargs received).
        self.calls: list[tuple[str, dict]] = []
        # NOTE(review): presumably read by the dispatch layer — confirm.
        self._dispatch_path = "unknown"

    def execute_keyframe(self, **kw):
        """Record the call and return a canned "rejected by gate" result.

        Only ``shot_id`` is echoed back from the kwargs (``None`` when it
        is absent); every other field of the result is a fixed value.
        """
        # Store a shallow copy so later mutation of the caller's kwargs
        # cannot alter the recorded history.
        self.calls.append(("keyframe", {**kw}))

        rejected_take = {
            "shot_id": kw.get("shot_id"),
            "success": False,  # failure: e.g. gate verdict was "reject"
            "final_state": "rejected_by_gate",
            "output_path": "/tmp/rejected_kf.png",
            "cost_usd": 0.04,
            "error": "gate verdict: face crop too tight",
            "take_index": 0,
            "gate_verdict": "reject",
            "model": "nbp",
            "pipeline": "still",
        }
        return SimpleNamespace(**rejected_take)

    def execute_video(self, **kw):
        """Unused by these tests; always raises ``NotImplementedError``."""
        raise NotImplementedError


def _wf() -> Workflow:
    """Return a new ``wf_on_failure`` workflow with one ``image_t2i`` step.

    Each call builds a fresh ``Workflow`` holding a single step ``"kf"``
    for shot ``EP001_SH02``.
    """
    keyframe_payload = {
        "shot_id": "EP001_SH02",
        "prompt": "p",
        "model": "nbp",
        "aspect_ratio": "9_16",
    }
    keyframe_step = WorkflowStep(
        step_id="kf",
        modality="image_t2i",
        payload=keyframe_payload,
    )
    return Workflow(workflow_id="wf_on_failure", steps=[keyframe_step])


def test_on_failure_hook_runs_eval_writes_diagnostic_scorecard():
    """With eval_on_failure=True, a failed step still gets a scorecard.

    The panel is run against the failed step's artifact, its scorecard is
    stored on the receipt under the panel id, and the judge receives an
    EvalContext whose metadata carries reason='step failed'.
    """
    runner = _FailingButProducesArtifactStepRunner()
    dispatch_ctx = DispatchContext(
        caller_id="phase6_on_failure",
        step_runner=runner,
        receipts_log_path="DISABLED",
    )

    diag_judge = FakeEvalNode(judge_id="diag_j", score=0.21, cost_usd=0.001)
    diag_panel = PanelOfJudges(panel_id="diag_panel", judges=[diag_judge])

    hooks = attach_eval_hooks(_wf(), diag_panel, eval_on_failure=True)
    _, post_step, on_failure = hooks

    # The hooks are handed to a separately built workflow instance.
    workflow = _wf()
    workflow.run(context=dispatch_ctx, post_step=post_step, on_failure=on_failure)

    # The step is marked failed because the StepRunner reported
    # success=False, yet the rejected render's path is still on the receipt.
    failed_step = workflow.steps[0]
    assert failed_step.status == "failed"
    receipt = failed_step.receipt
    assert receipt is not None
    run_result = receipt.run_result
    assert run_result.success is False
    assert run_result.output_path == "/tmp/rejected_kf.png"

    # post_step fires for every non-skipped step, failed ones included,
    # but its `_eval_step` only runs when step.status == "succeeded".
    # It is therefore a no-op here; the scorecard below comes from on_failure.
    eval_scores = receipt.eval_scores
    assert "diag_panel" in eval_scores
    assert eval_scores["diag_panel"]["panel_score"] == pytest.approx(0.21)

    # Exactly one judge invocation, and its EvalContext is tagged with the
    # 'step failed' reason.
    assert len(diag_judge.calls) == 1
    seen_ctx = diag_judge.calls[0]
    assert seen_ctx.metadata.get("reason") == "step failed"


def test_on_failure_hook_disabled_writes_no_scorecard():
    """With eval_on_failure=False the on_failure hook does nothing.

    Even though the failed step has an output_path, no scorecard is written
    and the judge is never called — the kill switch for callers who do not
    want to pay for eval on failures.
    """
    runner = _FailingButProducesArtifactStepRunner()
    dispatch_ctx = DispatchContext(
        caller_id="phase6_on_failure",
        step_runner=runner,
        receipts_log_path="DISABLED",
    )

    silent_judge = FakeEvalNode(judge_id="diag_off_j", score=0.5)
    silent_panel = PanelOfJudges(panel_id="diag_off_panel", judges=[silent_judge])

    hooks = attach_eval_hooks(_wf(), silent_panel, eval_on_failure=False)
    _, post_step, on_failure = hooks

    workflow = _wf()
    workflow.run(context=dispatch_ctx, post_step=post_step, on_failure=on_failure)

    failed_step = workflow.steps[0]
    assert failed_step.status == "failed"
    # Nothing was scored: the receipt has no eval entries and the judge
    # saw zero calls.
    assert failed_step.receipt.eval_scores == {}
    assert len(silent_judge.calls) == 0


def test_on_failure_hook_records_panel_cost_in_provenance():
    """The failure-path eval cost is stamped into the receipt's provenance.

    Uses attach_eval_hooks with its default eval_on_failure setting. The
    original author notes the diagnostic eval is a real Gemini call in
    production, so its bill must be visible next to the generation cost.
    """
    runner = _FailingButProducesArtifactStepRunner()
    dispatch_ctx = DispatchContext(
        caller_id="phase6_on_failure",
        step_runner=runner,
        receipts_log_path="DISABLED",
    )

    priced_judge = FakeEvalNode(judge_id="cost_diag_j", score=0.0, cost_usd=0.0123)
    priced_panel = PanelOfJudges(panel_id="cost_diag_panel", judges=[priced_judge])

    hooks = attach_eval_hooks(_wf(), priced_panel)
    _, post_step, on_failure = hooks

    workflow = _wf()
    workflow.run(context=dispatch_ctx, post_step=post_step, on_failure=on_failure)

    failed_step = workflow.steps[0]
    assert failed_step.status == "failed"
    # The single judge's cost is what must show up as the eval cost.
    recorded_cost = failed_step.receipt.provenance["eval_cost_usd"]
    assert recorded_cost == pytest.approx(0.0123)
