"""CP-9 Phase 6 integration test 1 — dispatch() of an eval modality yields
a real GenerationReceipt with final_state / eval_score / eval_reasoning /
judge_id / model_used / eval_cost_usd populated in
receipt.run_result.metadata.

Exercises the full chain end-to-end with REAL production code:
    register_default_eval_runners()
      → registry stores EvalImageRunner / EvalVideoRunner / EvalAudioRunner
    register_eval_node(modality_id, FakeEvalNode)
      → EvalRegistry binds judge_id → judge instance
    dispatch("eval_image_v1", payload, ctx)
      → _ensure_bootstrap → register_default_runners(step_runner)
      → registry.get_runner("eval_image_v1") → EvalImageRunner
      → EvalImageRunner.run(payload)
      → get_eval_node("eval_image_v1") → FakeEvalNode
      → FakeEvalNode.evaluate(EvalContext) → EvalResult
      → RunResult.metadata populated with the 6 eval keys
      → GenerationReceipt wraps the RunResult and the JSONL log gets a line

Mocking is limited to the EvalNode boundary (FakeEvalNode replaces the real
GeminiVisionEvalNode that would hit Gemini). Everything else is the real
production code path.
"""

import sys
import pathlib

# Prepend the directory four levels above this file to sys.path before any
# project module is imported.
# NOTE(review): which directory that is depends on where this file lives —
# confirm it is the one that makes the `recoil` package importable.
sys.path.insert(0, str(pathlib.Path(__file__).resolve().parent.parent.parent.parent))
from recoil.core.paths import ensure_pipeline_importable  # noqa: E402

# Called before the `recoil.pipeline.*` imports below.
# NOTE(review): presumably those imports rely on this call — confirm.
ensure_pipeline_importable()

import pytest  # noqa: E402

from recoil.pipeline.core.dispatch import (  # noqa: E402
    dispatch,
    register_default_eval_runners,
    _reset_bootstrap_for_tests,
)
from recoil.pipeline.core.dispatch_context import DispatchContext  # noqa: E402
from recoil.pipeline.core.eval import (  # noqa: E402
    register_eval_node,
    _reset_eval_registry_for_tests,
)
from recoil.pipeline.core.receipts import GenerationReceipt  # noqa: E402
from recoil.pipeline.core.registry import (  # noqa: E402
    MODALITY_EVAL_AUDIO_V1,
    MODALITY_EVAL_IMAGE_V1,
    MODALITY_EVAL_VIDEO_V1,
    _reset_for_tests,
)

from recoil.pipeline.core.tests._eval_test_helpers import (  # noqa: E402
    FakeEvalNode,
    StubStepRunner,
)


@pytest.fixture(autouse=True)
def reset_registries():
    """Run the test-reset hooks before and after every test in this module.

    The hooks come from the registry, dispatch and eval modules; they are
    invoked in the same order on setup and on teardown.
    """
    reset_hooks = (
        _reset_for_tests,
        _reset_bootstrap_for_tests,
        _reset_eval_registry_for_tests,
    )

    # Setup: start each test from freshly reset state.
    for hook in reset_hooks:
        hook()

    yield

    # Teardown: undo whatever the test registered.
    for hook in reset_hooks:
        hook()


def _ctx(**overrides) -> DispatchContext:
    """Build a DispatchContext for the tests in this module.

    Defaults are ``caller_id="phase6_integration"``, a fresh
    ``StubStepRunner`` and ``receipts_log_path="DISABLED"``; any keyword
    argument replaces the matching default or is passed through as an
    additional DispatchContext field.

    NOTE(review): "DISABLED" is presumably a sentinel that turns the JSONL
    receipts log off — confirm against DispatchContext / dispatch().
    """
    kwargs = {
        "caller_id": "phase6_integration",
        "step_runner": StubStepRunner(),
        "receipts_log_path": "DISABLED",
        **overrides,  # caller-supplied values win over the defaults above
    }
    return DispatchContext(**kwargs)


def test_dispatch_eval_image_v1_yields_real_receipt_with_eval_metadata(tmp_path):
    """Dispatching eval_image_v1 returns a GenerationReceipt whose RunResult
    metadata carries the six eval keys, and the registered judge is called
    exactly once with the EvalContext built from the payload."""
    register_default_eval_runners()
    fake_judge = FakeEvalNode(
        judge_id=MODALITY_EVAL_IMAGE_V1,
        score=0.83,
        reasoning="Composition strong, lighting matches brief.",
        cost_usd=0.0042,
        model_used="fake-judge-image",
    )
    register_eval_node(MODALITY_EVAL_IMAGE_V1, fake_judge)

    # Only the PNG signature is written; the judge is a fake.
    image_file = tmp_path / "kf.png"
    image_file.write_bytes(b"\x89PNG\r\n\x1a\n")

    payload = {
        "shot_id": "EP001_SH02",
        "artifact_path": str(image_file),
        "rubric": "Score this image 0-1.",
    }
    dispatch_context = _ctx(project="tartarus", episode=1, caller_id="phase6_image")
    receipt = dispatch(MODALITY_EVAL_IMAGE_V1, payload, context=dispatch_context)

    # Identity fields on the receipt come from the payload and the context.
    assert isinstance(receipt, GenerationReceipt)
    assert receipt.modality == MODALITY_EVAL_IMAGE_V1
    assert receipt.shot_id == "EP001_SH02"
    assert receipt.project == "tartarus"
    assert receipt.episode == 1
    assert receipt.caller_id == "phase6_image"

    # The wrapped RunResult reports success and no output file — an eval
    # yields a verdict, not an artifact.
    result = receipt.run_result
    assert result.success is True
    assert result.error is None
    assert result.modality == MODALITY_EVAL_IMAGE_V1
    assert result.output_path is None

    # All six eval keys are present with the values the fake judge returned.
    metadata = result.metadata
    assert metadata["final_state"] == "succeeded"
    assert metadata["eval_score"] == pytest.approx(0.83)
    assert metadata["eval_reasoning"] == "Composition strong, lighting matches brief."
    assert metadata["judge_id"] == MODALITY_EVAL_IMAGE_V1
    assert metadata["model_used"] == "fake-judge-image"
    assert metadata["eval_cost_usd"] == pytest.approx(0.0042)

    # The judge received one EvalContext, assembled by the runner.
    assert len(fake_judge.calls) == 1
    seen_context = fake_judge.calls[0]
    assert seen_context.target_artifact_path == image_file
    assert seen_context.rubric == "Score this image 0-1."
    assert seen_context.judge_id == MODALITY_EVAL_IMAGE_V1


def test_dispatch_eval_video_v1_yields_real_receipt_with_eval_metadata(tmp_path):
    """eval_video_v1 dispatch end-to-end → GenerationReceipt with the 6 eval keys.

    A FakeEvalNode is registered as the video judge; every value it was
    constructed with must surface in receipt.run_result.metadata, alongside
    final_state == "succeeded".
    """
    register_default_eval_runners()
    judge = FakeEvalNode(
        judge_id=MODALITY_EVAL_VIDEO_V1,
        score=0.71,
        reasoning="Motion plausible.",
        cost_usd=0.011,
        model_used="fake-judge-video",
    )
    register_eval_node(MODALITY_EVAL_VIDEO_V1, judge)

    artifact = tmp_path / "v.mp4"
    # Placeholder bytes (start of an MP4 `ftyp` box) — the judge is mocked.
    artifact.write_bytes(b"\x00\x00\x00\x18ftyp")

    receipt = dispatch(
        MODALITY_EVAL_VIDEO_V1,
        {
            "shot_id": "EP001_SH02",
            "artifact_path": str(artifact),
            "rubric": "Score this video 0-1.",
        },
        context=_ctx(),
    )

    assert isinstance(receipt, GenerationReceipt)
    assert receipt.run_result.success is True

    # All six eval keys, matching what the image-modality test checks.
    md = receipt.run_result.metadata
    assert md["final_state"] == "succeeded"
    assert md["eval_score"] == pytest.approx(0.71)
    assert md["eval_reasoning"] == "Motion plausible."
    assert md["judge_id"] == MODALITY_EVAL_VIDEO_V1
    assert md["model_used"] == "fake-judge-video"
    assert md["eval_cost_usd"] == pytest.approx(0.011)


def test_dispatch_eval_audio_v1_yields_real_receipt_with_eval_metadata(tmp_path):
    """eval_audio_v1 dispatch end-to-end → GenerationReceipt with the 6 eval keys.

    A FakeEvalNode is registered as the audio judge; every value it was
    constructed with must surface in receipt.run_result.metadata, alongside
    final_state == "succeeded".
    """
    register_default_eval_runners()
    judge = FakeEvalNode(
        judge_id=MODALITY_EVAL_AUDIO_V1,
        score=0.49,
        reasoning="Voice match thin.",
        cost_usd=0.0007,
        model_used="fake-judge-audio",
    )
    register_eval_node(MODALITY_EVAL_AUDIO_V1, judge)

    artifact = tmp_path / "a.mp3"
    # Placeholder bytes (an ID3 tag prefix) — the judge is mocked.
    artifact.write_bytes(b"ID3\x04\x00\x00\x00\x00\x00")

    receipt = dispatch(
        MODALITY_EVAL_AUDIO_V1,
        {
            "shot_id": "EP001_VO01",
            "artifact_path": str(artifact),
            "rubric": "Score this audio 0-1.",
        },
        context=_ctx(),
    )

    # Same receipt-type check the image and video tests perform.
    assert isinstance(receipt, GenerationReceipt)
    assert receipt.run_result.success is True

    # All six eval keys, matching what the image-modality test checks.
    md = receipt.run_result.metadata
    assert md["final_state"] == "succeeded"
    assert md["eval_score"] == pytest.approx(0.49)
    assert md["eval_reasoning"] == "Voice match thin."
    assert md["judge_id"] == MODALITY_EVAL_AUDIO_V1
    assert md["model_used"] == "fake-judge-audio"
    assert md["eval_cost_usd"] == pytest.approx(0.0007)


def test_dispatch_eval_image_v1_jsonl_log_carries_eval_metadata(tmp_path):
    """The JSONL audit log line for an eval-modality dispatch carries the
    same 6 eval keys, demonstrating the receipt round-trips through the log.

    The log is pointed at a file under tmp_path; exactly one line is expected,
    and its metadata is compared against both the fake judge's configured
    values and the in-memory receipt returned by dispatch().
    """
    import json

    register_default_eval_runners()
    judge = FakeEvalNode(
        judge_id=MODALITY_EVAL_IMAGE_V1,
        score=0.62,
        reasoning="serviceable",
        cost_usd=0.0023,
    )
    register_eval_node(MODALITY_EVAL_IMAGE_V1, judge)

    log_path = tmp_path / "receipts.jsonl"
    artifact = tmp_path / "kf.png"
    artifact.write_bytes(b"\x89PNG\r\n\x1a\n")

    receipt = dispatch(
        MODALITY_EVAL_IMAGE_V1,
        {
            "shot_id": "EP001_SH02",
            "artifact_path": str(artifact),
            "rubric": "Score this 0-1.",
        },
        context=_ctx(receipts_log_path=str(log_path)),
    )

    # Decode explicitly as UTF-8 so the test does not depend on the platform's
    # locale encoding.
    lines = [
        json.loads(line)
        for line in log_path.read_text(encoding="utf-8").splitlines()
        if line.strip()
    ]
    assert len(lines) == 1
    serialized = lines[0]
    assert serialized["modality"] == MODALITY_EVAL_IMAGE_V1
    assert serialized["receipt_id"] == receipt.receipt_id

    md = serialized["run_result"]["metadata"]
    live_md = receipt.run_result.metadata
    assert md["final_state"] == live_md["final_state"] == "succeeded"
    assert md["eval_score"] == pytest.approx(0.62)
    assert md["eval_reasoning"] == "serviceable"
    assert md["judge_id"] == MODALITY_EVAL_IMAGE_V1
    # model_used is left at FakeEvalNode's default here, so compare against
    # the in-memory receipt rather than a hard-coded value.
    assert md["model_used"] == live_md["model_used"]
    assert md["eval_cost_usd"] == pytest.approx(0.0023)


def test_dispatch_eval_modality_failure_path_yields_failure_receipt(tmp_path):
    """A judge that raises must not make dispatch() raise: the receipt's
    RunResult is a failure whose error names the exception, and whose
    metadata records final_state "failed", null score and reasoning, zero
    cost, the judge id and the exception class."""
    register_default_eval_runners()
    exploding_judge = FakeEvalNode(
        judge_id=MODALITY_EVAL_IMAGE_V1,
        raise_exc=RuntimeError("simulated judge blowup"),
    )
    register_eval_node(MODALITY_EVAL_IMAGE_V1, exploding_judge)

    image_file = tmp_path / "kf.png"
    image_file.write_bytes(b"\x89PNG")

    payload = {
        "shot_id": "EP001_SH02",
        "artifact_path": str(image_file),
        "rubric": "Score 0-1.",
    }
    receipt = dispatch(MODALITY_EVAL_IMAGE_V1, payload, context=_ctx())

    result = receipt.run_result
    assert result.success is False

    # Treat a missing error as empty text so the checks below fail with a
    # plain assertion rather than a TypeError.
    error_text = result.error or ""
    assert "RuntimeError" in error_text
    assert "simulated judge blowup" in error_text

    metadata = result.metadata
    assert metadata["final_state"] == "failed"
    assert metadata["eval_score"] is None
    assert metadata["eval_reasoning"] is None
    assert metadata["eval_cost_usd"] == 0.0
    assert metadata["judge_id"] == MODALITY_EVAL_IMAGE_V1
    assert metadata.get("error_class") == "RuntimeError"