"""Visual StoryGate core primitives.

StoryGate is built by composition, not by subclassing CriticLoop. Its verdict
taxonomy is board-local and deliberately separate from take-loop failure modes.
SSOT: consultations/recoil/visual-story-gates-2026-06-11/SYNTHESIS.md.
"""

from __future__ import annotations

import base64
from dataclasses import dataclass
import errno
import hashlib
from io import BytesIO
import json
import os
from pathlib import Path
import tempfile
import time
from typing import Any

import anthropic
from PIL import Image

from recoil.core.anthropic_client import anthropic_client
from recoil.core.atomic_write import jsonl_append_locked
from recoil.core.model_profiles import get_model
from recoil.pipeline._lib.board_builder import GRID_LAYOUTS
from recoil.pipeline._lib.recoil_bridge import load_project_config


# Stamped into the verdict dict as "schema_version" by StoryGate.evaluate_text.
SCHEMA_VERSION = 1
# Rubric revision identifier, stamped into the verdict dict as "prompt_version".
PROMPT_VERSION = "story_gate_rubric_v5"
# Closed set of routing classes; validate_verdict rejects any other
# routing.class value.
ROUTES = (
    "ok",
    "board_problem",
    "script_problem",
    "prompt_problem",
    "mixed",
    "judge_unavailable",
)
# Allowed "severity" values on forced checks and text-stageability findings.
SEVERITIES = ("HARD", "SOFT")
# Allowed gate modes, checked by story_gate_mode() and StoryGate.__init__.
MODES = ("off", "shadow", "enforce")

# Environment variable consulted first by story_gate_mode().
_STORY_GATE_ENV_VAR = "RECOIL_STORY_GATE"
# Project-config key consulted when the environment variable is not set.
_STORY_GATE_CONFIG_KEY = "story_gate_mode"
# Substrings located in the sidecar prompt by StoryGatePacket.from_sidecar to
# split it into beats text (AUTHORING onward) and optional scene context.
_AUTHORING_MARKER = "AUTHORING"
_SCENE_CONTEXT_MARKER = "SCENE CONTEXT"

# Final section of the text-stageability prompt (appended by
# build_text_stageability_prompt). It describes the JSON shape that
# validate_text_stageability checks. This is prompt text sent to the judge:
# editing it changes judge behavior.
_TEXT_STAGEABILITY_OUTPUT_CONTRACT = """OUTPUT CONTRACT - JSON only:
{"stageable": bool, "findings": [{"beat_index": int, "check": "causal_setup_present", "passed": bool, "severity": "HARD|SOFT", "confidence": float, "problem_kind": str, "reason": str, "injectable": false, "suggested_script_question": str}]}
Field rules (the validator rejects any deviation):
- One finding per numbered beat; EVERY finding carries EVERY field above, including passing findings.
- "severity" is "HARD" or "SOFT" even when passed is true: state the severity a failure would carry.
- "problem_kind" on a passing finding is "none".
- "suggested_script_question" is a NON-EMPTY string on every finding — for a passing finding, write the stageability question you verified (e.g. "Is the cause of the arm-catch established? Yes: ..."). Never null, never empty.
"""

# Image-judge prompt section used by build_image_judge_prompt when
# use_crops is true: tells the judge how the attached images are ordered.
_IMAGE_ORDERING_SECTION = """IMAGE ORDERING
Images are attached in this exact order: image 1 = full board; image 2 = panel 1 crop; image 3 = panel 2 crop; continue with per-panel crops in index order.
Reference panels using both panel index and image number when useful.
"""

# Image-judge prompt section: per-beat actor/target slot extraction that the
# judge performs from the text before inspecting the images.
_ACTOR_TARGET_SECTION = """ACTOR/TARGET SLOTS
Before viewing images, extract per beat {actor, target, required_relation, forbidden_staging} from the beat text AND the scene context (the causal ground truth).
If the scene context states where a character's attention or gaze is at the moment of a beat (e.g. "She stares at the pod." immediately before the beat's line), that object IS the beat's gaze target even when the beat text omits it.
Use these slots only as requirements to test against the literal images, not as evidence that something is visible.
"""

# Image-judge prompt section: literal, evidence-first description of each
# panel crop, including front-of/behind relations for named props.
_DESCRIBE_SECTION = """DESCRIBE
For each panel crop, describe literal visual evidence first. Describe only what is visible. Do not infer intended story action from the beat text. If ambiguous, say ambiguous.
Include: characters present; screen-position of each character and named prop (left/center/right × foreground/midground/background); body and face orientation of each character (toward camera / profile-left / away / etc.); gaze direction of each character; and the action mid-motion.
For each named prop, state explicitly whether it is IN FRONT OF or BEHIND each character's facing direction (their shoulder plane), citing the occlusion evidence: state which object overlaps/occludes which — the occluded object is the one behind. A prop that a character's body overlaps or that sits over their shoulder is BEHIND them — say so plainly; do not soften it to "beside" or "midground".
"""

# Image-judge prompt section: the per-panel binary checks. The check names
# listed here are the same ones held in REQUIRED_PANEL_CHECKS.
_FORCED_CHECKS_SECTION = """FORCED CHECKS
For each panel, provide binary checks with reasons: depicts_beat, spatially_possible, eyeline_consistent, object_of_gaze_in_frame_and_front, causal_setup_present.
causal_setup_present judges story-order causality: the panel's action must be motivated by PRIOR panels and ground truth as of that moment — never by later panels. Reachability must be established for the ACTING character (another character's proximity to the target does not establish it). A cause counts only if you can point to where the prior material states or directly implies it — a motivation you have to invent is absent ⇒ false, HARD.
object_of_gaze_in_frame_and_front asks: is the object this character looks at in front of them and in frame? If it is behind them or out of frame while the beat requires their attention on it ⇒ false, severity HARD.
A character required to stare at X while their face points at the camera and X sits behind their shoulder plane is NOT staring at X ⇒ false, HARD. If the front-of/behind relation is ambiguous in the drawing, mark false, SOFT, low confidence — never resolve spatial ambiguity in the image's favor.
A gaze binding from the scene context is a requirement, not a suggestion: if the ground truth states the character stares at X during the beat and the drawn eyeline goes to the camera or anywhere else, eyeline_consistent and object_of_gaze_in_frame_and_front both FAIL — "delivered to camera" / "legitimate reaction-shot choice" is not an excuse for a stated stare.
Confidence floor on eyeline_consistent and object_of_gaze_in_frame_and_front: if your confidence that the check PASSES is below 0.8, mark passed=false, severity SOFT. A shaky pass is a fail — low-confidence passes are exactly how staging errors slip through.
If your description says the target object is behind the character and the beat requires the character to face it, you must mark spatially_possible=false.
"""

# Image-judge prompt section: the causal check applied to each consecutive
# panel pair (N -> N+1).
_TRANSITIONS_SECTION = """TRANSITIONS
For each panel N→N+1: Name the visible or scripted cause in panel N (or the scene context) that produces the change in panel N+1. If you cannot name one, FAIL causal_setup_present, severity HARD.
A valid cause must supply BOTH: (a) purpose — why the actor acts on THIS target now; and (b) spatial precondition — the actor is positioned, in panel N or established context, to physically reach the target. Capability is not cause: "it is alive" / "the chassis moves" explains that the actor CAN act, not why it grabs this object or how it got within reach. If purpose or reachability is unestablished, FAIL causal_setup_present, severity HARD.
"""

# Image-judge prompt section: how to pick the single routing class. The class
# names listed here are the same ones held in ROUTES.
_ROUTING_SECTION = """ROUTING
Classify the board with exactly one route:
ok - nothing wrong.
board_problem - text supported coherent staging but the image contradicted/omitted it.
script_problem - the beat text itself lacks the needed cause/specification.
prompt_problem - the beat/context required X but the as-sent generation prompt (provided below) never encoded X.
mixed - both real.
judge_unavailable - judge could not complete a reliable verdict.
If ANY forced check failed on any panel or transition, the route cannot be ok — pick the route matching where the defect lives (image staging ⇒ board_problem; missing story cause ⇒ script_problem; un-encoded requirement ⇒ prompt_problem).
Provide confidence and a one-line evidence value.
"""

# Image-judge prompt section: instructs the judge not to pass panels on
# aesthetics or charitable readings.
_CONTRADICTION_TRAP_SECTION = """CONTRADICTION TRAP + NO BENEFIT OF THE DOUBT
A panel that is beautiful, on-style, and technically clean still fails if staging makes the described action physically impossible or causally unmotivated.
Do not pass a panel because it could be interpreted charitably; if a required spatial relation is ambiguous, mark the check false with SOFT severity and low confidence.
"""

# Image-judge prompt section: rules for fix_hint / fix_hint_injectable on
# panels and transitions.
_FIX_HINTS_SECTION = """FIX HINTS
Every panel and transition carries fix_hint and fix_hint_injectable. For failed ones, write a concrete fix_hint; for passing ones use fix_hint null and fix_hint_injectable false.
injectable = obeyable by the image model without changing what HAPPENS in the beat.
If the fix requires changing the event or adding missing story cause, fix_hint_injectable must be false.
"""

# Final section of the image-judge prompt (appended by
# build_image_judge_prompt). It spells out the JSON shape the judge must
# return; the panel/transition/routing keys named here are the ones
# validate_verdict inspects. This is prompt text sent to the judge: editing it
# changes judge behavior.
_IMAGE_OUTPUT_CONTRACT = """OUTPUT CONTRACT - JSON only:
Return exactly one JSON object, no prose outside it, matching this shape (use these EXACT key names):
{
  "text_stageability": null,
  "panels": [
    {"index": 1,
     "description": "<literal visual evidence>",
     "forced_checks": {
       "depicts_beat": {"passed": true, "severity": "SOFT", "confidence": 0.9, "reason": "<why>"},
       "spatially_possible": {"passed": true, "severity": "HARD", "confidence": 0.9, "reason": "<why>"},
       "eyeline_consistent": {"passed": true, "severity": "SOFT", "confidence": 0.9, "reason": "<why>"},
       "object_of_gaze_in_frame_and_front": {"passed": true, "severity": "HARD", "confidence": 0.9, "reason": "<why>"},
       "causal_setup_present": {"passed": true, "severity": "HARD", "confidence": 0.9, "reason": "<why>"}
     },
     "fix_hint": null,
     "fix_hint_injectable": false}
  ],
  "transitions": [
    {"from": 1, "to": 2,
     "forced_checks": {
       "causal_setup_present": {"passed": true, "severity": "HARD", "confidence": 0.9, "reason": "<why>"}
     },
     "fix_hint": null,
     "fix_hint_injectable": false}
  ],
  "routing": {"class": "ok", "confidence": 0.9, "evidence": "<one line>"}
}
Field rules (the validator rejects any deviation):
- panels: one entry per panel, ALL indexes 1..N including passing panels.
- transitions: one entry for EVERY consecutive pair (1,2), (2,3), ... including passing ones.
- Every forced-check entry carries ALL FOUR fields: "passed" (boolean — the verdict; key is "passed", not "value"), "severity" ("HARD" or "SOFT" — required even when passed is true: state the severity a FAILURE of this check would carry), "confidence" (number 0..1), "reason" (string).
- "fix_hint_injectable" is a boolean on EVERY panel and transition; when nothing failed use fix_hint null and fix_hint_injectable false.
- routing key is "class" (not "route"); value is exactly one of: ok, board_problem, script_problem, prompt_problem, mixed, judge_unavailable.
- text_stageability is null in the image call because it is merged by the caller. Omit schema_version/judge_model/prompt_version/board_id/source_sha256 — the caller stamps them.
"""

# Suffix appended to the original prompt by _judge_call when the first reply
# could not be parsed as a JSON object.
_JSON_REASK = (
    "Your previous reply was not valid JSON. Reply with ONLY the JSON object."
)


class StoryGateJudgeUnavailable(RuntimeError):
    """Signal that the external judge failed to deliver a trustworthy verdict.

    The human-readable cause is used as the exception message and is also
    kept on the ``reason`` attribute.
    """

    def __init__(self, reason: str) -> None:
        super().__init__(reason)
        self.reason = reason


def story_gate_mode(project: str) -> str:
    """Return the story gate mode that applies to ``project``.

    Precedence: the RECOIL_STORY_GATE environment variable when it is set,
    otherwise the project's ``story_gate_mode`` config value, otherwise
    ``"off"``.

    Raises:
        ValueError: if the resolved value is not one of ``MODES``.
    """
    env_value = os.environ.get(_STORY_GATE_ENV_VAR)
    if env_value is not None:
        # A set variable wins outright (even when empty); validated below.
        resolved = env_value
    else:
        project_cfg = load_project_config(project) or {}
        resolved = project_cfg.get(_STORY_GATE_CONFIG_KEY) or "off"

    if resolved in MODES:
        return resolved
    raise ValueError(
        f"invalid story gate mode {resolved!r}; expected one of {', '.join(MODES)}"
    )


@dataclass
class StoryGatePacket:
    board_id: str
    board_png: Path
    grid_cols: int
    grid_rows: int
    slots: int
    generation_prompt: str
    beats_text: str
    scene_context: str | None
    panels: list[dict]
    source_sha256: str
    character_descriptions: dict | None

    @classmethod
    def from_sidecar(cls, png_path: Path) -> "StoryGatePacket":
        png_path = Path(png_path)
        sidecar_path = Path(f"{png_path}.json")
        data = json.loads(sidecar_path.read_text(encoding="utf-8"))

        for key in ("prompt", "panels", "source_sha256"):
            if key not in data:
                raise ValueError(f"sidecar missing {key}")

        prompt = data["prompt"]
        panels = data["panels"]
        source_sha256 = data["source_sha256"]
        if not isinstance(prompt, str):
            raise ValueError("sidecar prompt must be a string")
        if not isinstance(panels, list):
            raise ValueError("sidecar panels must be a list")
        if not isinstance(source_sha256, str) or not source_sha256:
            raise ValueError("sidecar source_sha256 must be a non-empty string")

        authoring_idx = prompt.find(_AUTHORING_MARKER)
        if authoring_idx < 0:
            raise ValueError("prompt missing AUTHORING marker")
        scene_idx = prompt.find(_SCENE_CONTEXT_MARKER)

        slots = 4 if len(panels) <= 4 else 6
        _size_override, grid_cols, grid_rows = GRID_LAYOUTS[slots]
        scene_context = None
        if 0 <= scene_idx < authoring_idx:
            scene_context = prompt[scene_idx:authoring_idx]

        char_descs = data.get("character_descriptions")
        if char_descs is not None and not isinstance(char_descs, dict):
            raise ValueError("sidecar character_descriptions must be a dict")

        return cls(
            board_id=png_path.stem,
            board_png=png_path,
            grid_cols=grid_cols,
            grid_rows=grid_rows,
            slots=slots,
            generation_prompt=prompt,
            beats_text=prompt[authoring_idx:],
            scene_context=scene_context,
            panels=panels,
            source_sha256=source_sha256,
            character_descriptions=char_descs,
        )


def build_text_stageability_prompt(packet: StoryGatePacket) -> str:
    """Assemble the text-only (pre-generation) stageability judge prompt.

    Sections are joined with blank lines, in this order: title, board id,
    scene context, per-panel beats, task, hard-fail rule, finding rules, and
    the JSON output contract.
    """
    context_text = packet.scene_context or "(none provided)"

    task_section = (
        "TASK\n"
        "Answer FOR EACH numbered beat. Decide whether it can be staged "
        "as a single panel without inventing a missing cause. For each "
        "beat identify who is the actor, what is the target, what visible "
        "action is required, what prior state must be visible/inferable, "
        "and whether the action is motivated by the prior beat or the "
        "scene context."
    )
    hard_fail_section = (
        "HARD FAIL RULE\n"
        "A beat whose required cause/motivation is absent from beats plus "
        'causal ground truth must return passed: false, severity: "HARD".\n'
        "A valid cause must supply BOTH: (a) purpose — why the actor acts "
        "on THIS target now; and (b) spatial precondition — the actor's "
        "position established by a prior beat or the ground truth puts the "
        "target within reach. Capability is not cause: \"it is alive\" / "
        "\"the chassis moves\" explains that the actor CAN act, not why it "
        "grabs this object or how it got within reach. Missing purpose ⇒ "
        "problem_kind absent_cause; missing reachability ⇒ problem_kind "
        "ambiguous_prior_state; both are HARD.\n"
        "Judge causality in STORY ORDER: a beat's motivation must be "
        "inferable from PRIOR beats and ground truth as of that moment — "
        "never from later beats. If you find yourself justifying beat N "
        "with what happens in beat N+1, the cause is absent ⇒ HARD.\n"
        "Reachability is about the ACTOR: the acting character's own "
        "position must be established near the target — another "
        "character's proximity to the target does not establish it.\n"
        "LOCATE, don't construct: a cause counts only if you can quote or "
        "point to where the text states or directly implies it. If your "
        "reason has to invent the motivation, the cause is absent ⇒ HARD."
    )
    finding_rules_section = (
        "FINDING RULES\n"
        "Use check = causal_setup_present. Use injectable = false because "
        "this is a script-stageability pass, not an image-prompt repair. "
        "Use problem_kind to distinguish absent_cause, absent_actor, "
        "absent_target, absent_visible_action, ambiguous_prior_state, "
        "or none."
    )

    sections = (
        "STORY GATE TEXT-STAGEABILITY JUDGE",
        f"BOARD ID\n{packet.board_id}",
        "SCENE CONTEXT - causal ground truth\n" + context_text,
        "PER-PANEL BEATS\n" + packet.beats_text,
        task_section,
        hard_fail_section,
        finding_rules_section,
        _TEXT_STAGEABILITY_OUTPUT_CONTRACT,
    )
    return "\n\n".join(sections)


def build_image_judge_prompt(packet: StoryGatePacket, *, use_crops: bool = True) -> str:
    """Assemble the post-generation image judge rubric prompt.

    Args:
        packet: Board inputs; supplies ids, grid geometry, source text, and the
            as-sent generation prompt.
        use_crops: When true, the prompt tells the judge that per-panel crops
            follow the full board; when false, it states that only the full
            board image is attached.

    Returns:
        The rubric sections joined with blank lines, ending with the JSON
        output contract.
    """
    context_text = packet.scene_context or "(none provided)"

    # An empty or missing description mapping is reported as "(none provided)".
    if packet.character_descriptions:
        characters = json.dumps(packet.character_descriptions, indent=2, sort_keys=True)
    else:
        characters = "(none provided)"

    if use_crops:
        ordering = _IMAGE_ORDERING_SECTION
    else:
        ordering = (
            "IMAGE ORDERING\n"
            "Exactly ONE image is attached: image 1 = the full board. "
            "No per-panel crops are attached; judge each panel within the "
            "full board image.\n"
        )

    header = (
        f"BOARD ID\n{packet.board_id}\n\n"
        f"GRID\ncols={packet.grid_cols}, rows={packet.grid_rows}, slots={packet.slots}\n\n"
        f"SOURCE SHA256\n{packet.source_sha256}"
    )
    ground_truth = "\n\n".join(
        (
            "SCENE CONTEXT - causal ground truth\n" + context_text,
            "PER-PANEL BEATS\n" + packet.beats_text,
            "CHARACTER DESCRIPTIONS\n" + characters,
        )
    )
    as_sent = (
        "AS-SENT GENERATION PROMPT FOR prompt_problem COMPARISON\n"
        + packet.generation_prompt
    )

    parts = (
        "STORY GATE IMAGE JUDGE",
        header,
        ordering,
        ground_truth,
        _ACTOR_TARGET_SECTION,
        _DESCRIBE_SECTION,
        _FORCED_CHECKS_SECTION,
        _TRANSITIONS_SECTION,
        _ROUTING_SECTION,
        _CONTRADICTION_TRAP_SECTION,
        _FIX_HINTS_SECTION,
        as_sent,
        _IMAGE_OUTPUT_CONTRACT,
    )
    return "\n\n".join(parts)


def crop_panels(board_png: Path, cols: int, rows: int, slots: int) -> list[bytes]:
    """Cut the board into grid cells and return them as PNG byte strings.

    Cells are visited row by row (left to right, then top to bottom) and at
    most ``slots`` of them are returned. Cell size is the board size
    floor-divided by the grid dimensions, so leftover pixels on the right and
    bottom edges are not included.
    """
    encoded: list[bytes] = []
    with Image.open(board_png) as board:
        rgba = board.convert("RGBA")
        board_w, board_h = rgba.size
        cell_w = board_w // cols
        cell_h = board_h // rows
        # Flat cell counter; divmod recovers (row, col) in row-major order.
        for cell in range(max(rows, 0) * max(cols, 0)):
            if len(encoded) >= slots:
                break
            row, col = divmod(cell, cols)
            left = col * cell_w
            top = row * cell_h
            tile = rgba.crop((left, top, left + cell_w, top + cell_h))
            sink = BytesIO()
            tile.save(sink, format="PNG")
            encoded.append(sink.getvalue())
    return encoded


def validate_verdict(verdict: dict) -> list[str]:
    """Check the structure of a stamped verdict.

    Returns:
        Human-readable problem strings in discovery order (top-level keys,
        routing, panels, transitions). An empty list means the verdict is
        structurally valid.
    """
    if not isinstance(verdict, dict):
        return ["verdict must be a dict"]

    expected_keys = (
        "schema_version",
        "judge_model",
        "prompt_version",
        "board_id",
        "source_sha256",
        "text_stageability",
        "panels",
        "transitions",
        "routing",
    )
    issues: list[str] = [
        f"missing top-level key: {key}" for key in expected_keys if key not in verdict
    ]

    routing = verdict.get("routing")
    if isinstance(routing, dict):
        route = routing.get("class")
        if route not in ROUTES:
            issues.append(f"routing.class invalid: {route!r}")
        _check_confidence(routing.get("confidence"), "routing.confidence", issues)
    else:
        issues.append("routing must be a dict")

    panels = verdict.get("panels")
    if not isinstance(panels, list):
        issues.append("panels must be a list")
        panels = []
    for position, panel in enumerate(panels):
        label = f"panels[{position}]"
        if isinstance(panel, dict):
            _validate_panel(panel, label, issues)
        else:
            issues.append(f"{label} must be a dict")

    transitions = verdict.get("transitions")
    if not isinstance(transitions, list):
        issues.append("transitions must be a list")
        transitions = []
    # Only emptiness is checked here, not one transition per consecutive pair.
    if len(panels) > 1 and not transitions:
        issues.append("transitions empty for multi-panel verdict")
    for position, transition in enumerate(transitions):
        label = f"transitions[{position}]"
        if isinstance(transition, dict):
            _validate_transition(transition, label, issues)
        else:
            issues.append(f"{label} must be a dict")

    return issues


def _validate_panel(panel: dict, path: str, problems: list[str]) -> None:
    if not isinstance(panel.get("index"), int):
        problems.append(f"{path}.index missing or not int")
    description = panel.get("description")
    if not isinstance(description, str) or not description.strip():
        problems.append(f"{path}.description missing or empty")
    forced_checks = panel.get("forced_checks")
    if not isinstance(forced_checks, dict):
        problems.append(f"{path}.forced_checks missing or not dict")
    else:
        _validate_forced_checks(forced_checks, f"{path}.forced_checks", problems, required=REQUIRED_PANEL_CHECKS)
    if not isinstance(panel.get("fix_hint_injectable"), bool):
        problems.append(f"{path}.fix_hint_injectable missing or not bool")


def _validate_transition(transition: dict, path: str, problems: list[str]) -> None:
    if not isinstance(transition.get("from"), int):
        problems.append(f"{path}.from missing or not int")
    if not isinstance(transition.get("to"), int):
        problems.append(f"{path}.to missing or not int")
    forced_checks = transition.get("forced_checks")
    if not isinstance(forced_checks, dict):
        problems.append(f"{path}.forced_checks missing or not dict")
    else:
        _validate_forced_checks(forced_checks, f"{path}.forced_checks", problems, required=REQUIRED_TRANSITION_CHECKS)
    if not isinstance(transition.get("fix_hint_injectable"), bool):
        problems.append(f"{path}.fix_hint_injectable missing or not bool")


# Forced-check names every panel entry must carry; passed to
# _validate_forced_checks as ``required`` by _validate_panel.
REQUIRED_PANEL_CHECKS = (
    "depicts_beat",
    "spatially_possible",
    "eyeline_consistent",
    "object_of_gaze_in_frame_and_front",
    "causal_setup_present",
)
# Forced-check names every transition entry must carry (see
# _validate_transition).
REQUIRED_TRANSITION_CHECKS = ("causal_setup_present",)


def _validate_forced_checks(
    forced_checks: dict,
    path: str,
    problems: list[str],
    required: tuple[str, ...] = (),
) -> None:
    # Empty/missing named checks must NOT validate — a judge that skips the
    # forced worksheet is a contract violation, not an ok verdict (gate r5).
    for name in required:
        if name not in forced_checks:
            problems.append(f"{path}.{name} is required")
    for name, entry in forced_checks.items():
        entry_path = f"{path}.{name}"
        if not isinstance(entry, dict):
            problems.append(f"{entry_path} must be a dict")
            continue
        if not isinstance(entry.get("passed"), bool):
            problems.append(f"{entry_path}.passed missing or not bool")
        severity = entry.get("severity")
        if severity not in SEVERITIES:
            problems.append(f"{entry_path}.severity invalid: {severity!r}")
        _check_confidence(entry.get("confidence"), f"{entry_path}.confidence", problems)
        if not isinstance(entry.get("reason"), str):
            problems.append(f"{entry_path}.reason missing or not str")


def _check_confidence(value: Any, path: str, problems: list[str]) -> None:
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        problems.append(f"{path} missing or not number")
        return
    if not 0 <= float(value) <= 1:
        problems.append(f"{path} out of range [0, 1]: {value!r}")


def _judge_call(
    prompt: str,
    images: list[bytes] | None,
    *,
    model: str | None = None,
    max_attempts: int = 3,
) -> dict:
    """Ask the StoryGate judge and return its reply parsed as a JSON object.

    If the first reply cannot be parsed, the prompt is re-sent once with the
    JSON re-ask suffix appended. Every failure, including unexpected
    exceptions, surfaces as StoryGateJudgeUnavailable.

    Args:
        prompt: Full judge prompt text.
        images: PNG byte strings to attach, or None for a text-only call.
        model: Judge model id; defaults to the "board_critic"/"qc" profile.
        max_attempts: Transport attempt budget for the first raw call.

    Raises:
        StoryGateJudgeUnavailable: transport failure, or unparsable JSON after
            the re-ask.
    """

    resolved_model = model or get_model("board_critic", "qc")
    # NOTE(review): the re-ask is issued with its own single-attempt budget on
    # top of whatever the first call consumed, so the combined number of
    # transport attempts can reach max_attempts + 1 — confirm this matches the
    # intended total-budget contract (gate r6).
    asks = (
        (prompt, max_attempts),
        (f"{prompt}\n\n{_JSON_REASK}", 1),
    )
    try:
        parse_failure: Exception | None = None
        for ask_prompt, budget in asks:
            raw = _judge_raw_call(
                ask_prompt,
                images,
                model=resolved_model,
                max_attempts=budget,
            )
            try:
                return _parse_json_object(raw)
            except (json.JSONDecodeError, ValueError) as exc:
                parse_failure = exc
        raise StoryGateJudgeUnavailable(
            f"invalid JSON: {parse_failure}"
        ) from parse_failure
    except StoryGateJudgeUnavailable:
        raise
    except Exception as exc:
        # Anything unexpected is still reported as "judge unavailable".
        raise StoryGateJudgeUnavailable(str(exc) or type(exc).__name__) from exc


def _judge_raw_call(
    prompt: str,
    images: list[bytes] | None,
    *,
    model: str,
    max_attempts: int,
) -> str:
    """Send one judge request and return the raw reply text.

    When the configured transport is "cli" the call is delegated to the CLI
    lane and ``max_attempts`` is not used for retries. Otherwise the SDK
    client is called up to ``max_attempts`` times, sleeping 2 seconds after
    the first retryable failure and 8 seconds after each later one.

    Raises:
        StoryGateJudgeUnavailable: invalid ``max_attempts``, a non-retryable
            error, or retries exhausted.
    """
    if max_attempts < 1:
        raise StoryGateJudgeUnavailable("max_attempts must be >= 1")

    from recoil.core.claude_cli import claude_transport

    if claude_transport() == "cli":
        # CLI lane: same subscription, different ingress — immune to the
        # SDK lane's opaque tier throttle (live-proven 2026-06-11).
        return _judge_raw_call_cli(prompt, images, model=model)

    client = anthropic_client()
    failure: Exception | None = None
    attempt = 0
    while attempt < max_attempts:
        attempt += 1
        try:
            response = client.messages.create(
                model=model,
                max_tokens=8192,
                messages=[{"role": "user", "content": _message_content(prompt, images)}],
            )
            return _response_text(response)
        except Exception as exc:
            if not _is_retryable_judge_error(exc):
                raise StoryGateJudgeUnavailable(str(exc) or type(exc).__name__) from exc
            failure = exc
        # Back off only when another attempt will follow.
        if attempt < max_attempts:
            time.sleep(2 if attempt == 1 else 8)

    reason = str(failure) if failure else "judge retries exhausted"
    raise StoryGateJudgeUnavailable(reason or "judge retries exhausted")


def _judge_raw_call_cli(
    prompt: str,
    images: list[bytes] | None,
    *,
    model: str,
) -> str:
    """Run the judge through the CLI lane and return its raw reply.

    Image bytes are written to a temporary directory as ``image_01.png``,
    ``image_02.png``, ... and their paths are handed to ``claude_cli_call``;
    the directory is removed when the call returns.

    Raises:
        StoryGateJudgeUnavailable: when the CLI call raises ClaudeCliError.
    """

    from recoil.core.claude_cli import ClaudeCliError, claude_cli_call

    try:
        with tempfile.TemporaryDirectory(prefix="story_gate_judge_") as workdir:
            root = Path(workdir)
            written: list[Path] = []
            for number, payload in enumerate(images or [], start=1):
                target = root / f"image_{number:02d}.png"
                target.write_bytes(payload)
                written.append(target)
            # No images: pass None rather than an empty list.
            return claude_cli_call(prompt, written or None, model=model)
    except ClaudeCliError as exc:
        raise StoryGateJudgeUnavailable(str(exc)) from exc


def _message_content(prompt: str, images: list[bytes] | None) -> list[dict]:
    content: list[dict] = []
    for image in images or []:
        content.append(
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": base64.b64encode(image).decode("ascii"),
                },
            }
        )
    content.append({"type": "text", "text": prompt})
    return content


def _response_text(response: Any) -> str:
    parts: list[str] = []
    for block in getattr(response, "content", []) or []:
        text = getattr(block, "text", None)
        if text is None and isinstance(block, dict):
            text = block.get("text")
        if isinstance(text, str):
            parts.append(text)
    return "\n".join(parts)


def _parse_json_object(text: str) -> dict:
    start = text.find("{")
    end = text.rfind("}")
    if start < 0 or end < start:
        raise ValueError("no JSON object found")
    parsed = json.loads(text[start : end + 1])
    if not isinstance(parsed, dict):
        raise ValueError("JSON response must be an object")
    return parsed


def _is_retryable_judge_error(exc: Exception) -> bool:
    """Tell whether a judge-call failure is worth retrying.

    Retryable: rate-limit errors, any exception whose ``status_code`` is 429
    or in the 5xx range, and timeouts (built-in ``TimeoutError`` or the SDK's
    ``APITimeoutError`` when the SDK defines it).
    """
    if isinstance(exc, anthropic.RateLimitError):
        return True

    code = getattr(exc, "status_code", None)
    if code == 429 or (isinstance(code, int) and 500 <= code <= 599):
        return True

    timeout_types = (
        TimeoutError,
        getattr(anthropic, "APITimeoutError", TimeoutError),
    )
    return isinstance(exc, timeout_types)


def validate_text_stageability(payload: object) -> list[str]:
    """Structural validation for the Phase-2 text-stageability contract.

    Fail-closed companion to validate_verdict: a parseable-but-malformed
    response (e.g. {}) must NOT be treated as a clean ok verdict.

    Args:
        payload: The parsed judge reply.

    Returns:
        Problem strings in discovery order; an empty list means valid.
    """
    problems: list[str] = []
    if not isinstance(payload, dict):
        return ["text_stageability response must be a JSON object"]
    if not isinstance(payload.get("stageable"), bool):
        problems.append("stageable must be a boolean")
    findings = payload.get("findings")
    if not isinstance(findings, list):
        problems.append("findings must be a list")
        return problems
    for idx, finding in enumerate(findings):
        where = f"findings[{idx}]"
        if not isinstance(finding, dict):
            problems.append(f"{where} must be an object")
            continue
        for key in ("beat_index", "check", "passed", "severity", "reason"):
            if key not in finding:
                problems.append(f"{where}.{key} is required")
        # Type checks run whenever the key is PRESENT, so an explicit null
        # cannot slip past them; a missing key is reported once, above.
        if "beat_index" in finding:
            beat_index = finding["beat_index"]
            if isinstance(beat_index, bool) or not isinstance(beat_index, int):
                problems.append(f"{where}.beat_index must be an integer")
        if "passed" in finding and not isinstance(finding["passed"], bool):
            problems.append(f"{where}.passed must be a boolean")
        if "severity" in finding:
            severity = finding["severity"]
            if severity not in SEVERITIES:
                problems.append(f"{where}.severity invalid: {severity!r}")
        if "injectable" in finding and not isinstance(finding["injectable"], bool):
            problems.append(f"{where}.injectable must be a boolean")
        # EVERY finding carries the full Phase-2 contract (gate r5: the
        # contract does not distinguish passing from failed findings).
        # NOTE: `check` equality with causal_setup_present is deliberately NOT
        # enforced — pinning one value would break legitimately-added checks
        # (gate r4 partial refutation, standing).
        confidence = finding.get("confidence")
        # Compared without float(): an oversized int would raise OverflowError.
        if (
            isinstance(confidence, bool)
            or not isinstance(confidence, (int, float))
            or not (0 <= confidence <= 1)
        ):
            problems.append(f"{where}.confidence must be a number in [0,1]")
        if "injectable" not in finding:
            problems.append(f"{where}.injectable is required")
        for key in ("problem_kind", "suggested_script_question"):
            if not finding.get(key):
                problems.append(f"{where}.{key} is required")
    return problems


class StoryGate:
    """Thin facade over the StoryGate judge calls (text pass and image pass).

    ``mode`` must be one of ``MODES``. ``"enforce"`` is a recognised mode but
    is rejected at construction time because it is not implemented yet.
    """

    def __init__(self, mode: str = "shadow") -> None:
        """Validate and store the gate mode.

        Raises:
            ValueError: ``mode`` is not listed in ``MODES``.
            NotImplementedError: ``mode`` is ``"enforce"``.
        """
        if mode not in MODES:
            allowed = ", ".join(MODES)
            raise ValueError(
                f"invalid story gate mode {mode!r}; expected one of {allowed}"
            )
        if mode == "enforce":
            raise NotImplementedError("story gate enforce ships in v1.1")
        self.mode = mode

    def evaluate_text(
        self,
        packet: StoryGatePacket,
        *,
        model: str | None = None,
    ) -> dict:
        """Run the text stageability pass for ``packet`` and wrap it as a verdict.

        The judge is asked once. If its reply violates the output contract it
        is asked a single second time with the first five violations quoted
        back; a second violation raises ``StoryGateJudgeUnavailable`` so that a
        malformed text verdict can never read as ok (gate r3).

        Returns:
            A verdict dict with empty ``panels``/``transitions`` whose routing
            class is ``"script_problem"`` when a HARD finding failed and
            ``"ok"`` otherwise.
        """
        judge_model = model or get_model("board_critic", "qc")
        base_prompt = build_text_stageability_prompt(packet)

        stageability = _judge_call(base_prompt, None, model=judge_model)
        violations = validate_text_stageability(stageability)
        if violations:
            quoted = "; ".join(violations[:5])
            retry_suffix = (
                "\n\nYour previous reply violated the output contract ("
                f"{quoted}"
                "). Reply with ONLY the JSON object matching the contract."
            )
            stageability = _judge_call(
                base_prompt + retry_suffix,
                None,
                model=judge_model,
            )
            violations = validate_text_stageability(stageability)
            if violations:
                raise StoryGateJudgeUnavailable(
                    "text_stageability contract violation: "
                    + "; ".join(violations[:5])
                )

        hard_failure = _has_hard_failed_text_finding(stageability)
        failed_reason = _first_failed_text_reason(stageability)
        if hard_failure:
            routing = {
                "class": "script_problem",
                "confidence": 1.0,
                "evidence": failed_reason,
            }
        else:
            routing = {
                "class": "ok",
                "confidence": 0.0,
                "evidence": "text stageable",
            }

        return {
            "schema_version": SCHEMA_VERSION,
            "judge_model": judge_model,
            "prompt_version": PROMPT_VERSION,
            "board_id": packet.board_id,
            "source_sha256": packet.source_sha256,
            "text_stageability": stageability,
            "panels": [],
            "transitions": [],
            "routing": routing,
        }

    def evaluate_board(
        self,
        packet: StoryGatePacket,
        *,
        use_crops: bool = True,
        model: str | None = None,
    ) -> dict:
        """Run the image judge on the board PNG (plus panel crops if requested).

        The reply is stamped with provenance fields, then checked against the
        verdict schema and against panel/transition coverage for the packet.
        On any problem the judge is re-asked once with every problem listed;
        if the second reply is still invalid ``StoryGateJudgeUnavailable`` is
        raised.
        """
        judge_model = model or get_model("board_critic", "qc")

        images = [packet.board_png.read_bytes()]
        if use_crops:
            images += crop_panels(
                packet.board_png,
                packet.grid_cols,
                packet.grid_rows,
                packet.slots,
            )

        prompt = build_image_judge_prompt(packet, use_crops=use_crops)

        def judged(prompt_text):
            # One judge round-trip: call, stamp, then collect every problem.
            reply = _judge_call(prompt_text, images, model=judge_model)
            stamped = _stamp_verdict(reply, packet, judge_model)
            found = validate_verdict(stamped) + _coverage_problems(stamped, packet)
            return stamped, found

        verdict, problems = judged(prompt)
        if not problems:
            return verdict

        repair_prompt = (
            f"{prompt}\n\nYour previous reply did not match the required "
            f"verdict schema: {'; '.join(problems)}. Reply with ONLY a "
            "corrected JSON object."
        )
        verdict, problems = judged(repair_prompt)
        if problems:
            raise StoryGateJudgeUnavailable(
                "invalid verdict schema: " + "; ".join(problems)
            )
        return verdict


def _sorted_for_report(values) -> list:
    """Return ``values`` sorted for display in an error message, never raising."""
    items = list(values)
    try:
        return sorted(items)
    except TypeError:
        # Mixed types (e.g. 1 and "2") have no natural ordering; repr order
        # keeps the message deterministic instead of crashing the validator.
        return sorted(items, key=repr)


def _coverage_problems(verdict: dict, packet: StoryGatePacket) -> list[str]:
    """Panel/transition coverage against the packet — an image verdict that
    omits panels (or their N->N+1 transitions) is partial, not ok (gate r6).

    The expected beat count is ``len(packet.panels)`` when the packet carries
    panels, otherwise ``int(packet.slots)``. Panels must cover exactly the
    indexes ``1..beat_count``; when there is more than one beat, transitions
    must cover exactly the pairs ``(i, i + 1)``.

    Judge output is untrusted: indexes of mixed or unhashable types are
    reported as a coverage mismatch rather than raising, so the caller's
    repair / fail-closed path always receives a problem list.

    Returns:
        A list of human-readable problem strings; empty when coverage is exact.
    """
    problems: list[str] = []
    panels = verdict.get("panels")
    if not isinstance(panels, list):
        return ["panels must be a list"]

    indexes: set = set()
    unusable_index = False
    for panel in panels:
        if not isinstance(panel, dict):
            continue
        try:
            indexes.add(panel.get("index"))
        except TypeError:
            # Unhashable index (list/dict) can never match an expected int.
            unusable_index = True

    beat_count = len(packet.panels) if packet.panels else int(packet.slots)
    expected = set(range(1, beat_count + 1))
    if unusable_index or indexes != expected:
        got = _sorted_for_report(i for i in indexes if i is not None)
        problems.append(
            f"panel coverage mismatch: expected indexes {sorted(expected)}, got {got}"
        )

    transitions = verdict.get("transitions")
    if not isinstance(transitions, list):
        return problems + ["transitions must be a list"]

    pairs: set = set()
    unusable_pair = False
    for transition in transitions:
        if not isinstance(transition, dict):
            continue
        try:
            pairs.add((transition.get("from"), transition.get("to")))
        except TypeError:
            unusable_pair = True

    expected_pairs = {(i, i + 1) for i in range(1, beat_count)}
    # With a single beat there are no transitions to require, and extra ones
    # are tolerated (unchanged from the previous behaviour).
    if beat_count > 1 and (unusable_pair or pairs != expected_pairs):
        got_pairs = _sorted_for_report(p for p in pairs if None not in p)
        problems.append(
            f"transition coverage mismatch: expected {sorted(expected_pairs)}, got {got_pairs}"
        )
    return problems


def _stamp_verdict(
    verdict: dict,
    packet: StoryGatePacket,
    judge_model: str,
) -> dict:
    """Return a shallow copy of ``verdict`` carrying gate-owned provenance.

    The schema version, judge model, prompt version, board id and source hash
    are always overwritten with the gate's own values. ``text_stageability``,
    ``panels`` and ``transitions`` are only filled in when the judge omitted
    them. The input dict is not mutated.
    """
    stamped = dict(verdict)
    stamped.update(
        {
            "schema_version": SCHEMA_VERSION,
            "judge_model": judge_model,
            "prompt_version": PROMPT_VERSION,
            "board_id": packet.board_id,
            "source_sha256": packet.source_sha256,
        }
    )
    optional_defaults = (
        ("text_stageability", None),
        ("panels", []),
        ("transitions", []),
    )
    for key, fallback in optional_defaults:
        stamped.setdefault(key, fallback)
    return stamped


def _has_hard_failed_text_finding(text_stageability: dict) -> bool:
    """Tell whether any finding is an explicit failure (``passed is False``)
    with severity ``"HARD"``. Non-dict findings are ignored."""
    return any(
        entry.get("passed") is False and entry.get("severity") == "HARD"
        for entry in text_stageability.get("findings", [])
        if isinstance(entry, dict)
    )


def _first_failed_text_reason(text_stageability: dict) -> str:
    """Pick the reason to surface as evidence for a failed text pass.

    Preference order:

    1. the first failed HARD finding with a non-blank string reason — this is
       the finding that actually drives a ``script_problem`` routing, so an
       earlier SOFT failure must not be reported in its place;
    2. otherwise the first failed finding of any severity with a non-blank
       string reason;
    3. otherwise the generic ``"text stageability hard failure"`` message.

    Non-dict findings and a missing/``None`` ``findings`` value are tolerated.
    """
    first_any: str | None = None
    for finding in text_stageability.get("findings", []) or []:
        if not isinstance(finding, dict) or finding.get("passed") is not False:
            continue
        reason = finding.get("reason")
        if not (isinstance(reason, str) and reason.strip()):
            continue
        if finding.get("severity") == "HARD":
            return reason
        if first_any is None:
            first_any = reason
    if first_any is not None:
        return first_any
    return "text stageability hard failure"


def judge_unavailable_verdict(
    board_id: str,
    source_sha256: str,
    reason: str,
) -> dict:
    """Build the placeholder verdict recorded when no judge verdict exists.

    The routing class is ``"judge_unavailable"`` with zero confidence and
    ``reason`` as evidence. Resolving the judge model is best-effort: any
    failure there is recorded as ``"unresolved"`` instead of propagating.
    """
    try:
        resolved_model = get_model("board_critic", "qc")
    except Exception:
        resolved_model = "unresolved"

    routing = {
        "class": "judge_unavailable",
        "confidence": 0.0,
        "evidence": reason,
    }
    return {
        "schema_version": SCHEMA_VERSION,
        "judge_model": resolved_model,
        "prompt_version": PROMPT_VERSION,
        "board_id": board_id,
        "source_sha256": source_sha256,
        "text_stageability": None,
        "panels": [],
        "transitions": [],
        "routing": routing,
    }


def write_verdict(
    storyboards_dir: Path,
    board_stem: str,
    verdict: dict,
) -> tuple[Path, str]:
    """Persist ``verdict`` as ``<board_stem>.verdict.json`` via temp-file + rename.

    The JSON body is written to a temporary file in the destination directory,
    flushed and fsynced, then moved over the final path with ``os.replace``.
    An ``EINVAL`` from flush/fsync is ignored; any other failure removes the
    temporary file and re-raises.

    Returns:
        ``(path, sha256_hex)`` where the hash is computed over the exact bytes
        written.
    """
    target = Path(storyboards_dir) / f"{board_stem}.verdict.json"
    target.parent.mkdir(parents=True, exist_ok=True)

    payload = json.dumps(verdict, indent=2, ensure_ascii=False).encode("utf-8")
    digest = hashlib.sha256(payload).hexdigest()

    raw_fd, scratch_path = tempfile.mkstemp(
        dir=target.parent,
        prefix=f".{target.name}.",
        suffix=".tmp",
    )
    try:
        with os.fdopen(raw_fd, "wb") as out:
            out.write(payload)
            try:
                out.flush()
                os.fsync(out.fileno())
            except OSError as sync_error:
                # EINVAL is tolerated here; every other error is fatal.
                if sync_error.errno != errno.EINVAL:
                    raise
        os.replace(scratch_path, target)
    except Exception:
        # Never leave the scratch file behind on failure.
        try:
            os.unlink(scratch_path)
        except FileNotFoundError:
            pass
        raise

    return target, digest


def append_label(storyboards_dir: Path, row: dict) -> Path:
    """Append ``row`` to ``story_gate_labels.jsonl`` under ``storyboards_dir``
    via ``jsonl_append_locked`` and return the ledger path."""
    ledger = Path(storyboards_dir).joinpath("story_gate_labels.jsonl")
    jsonl_append_locked(ledger, row)
    return ledger


def verdict_summary(
    verdict: dict,
    *,
    mode: str,
    verdict_path: str,
    verdict_hash: str,
) -> dict:
    """Project a verdict sidecar down to the compact Beat.board summary.

    ``severity`` is ``"HARD"`` if any failed finding is HARD, ``"SOFT"`` if
    there are only other failed findings, and ``"ok"`` when nothing failed.
    ``summary`` is the first non-blank reason among failed HARD findings, or
    ``"ok"`` when there is none.
    """
    routing = verdict.get("routing") or {}
    failures = _failed_findings(verdict)

    hard_reasons = [reason for level, reason in failures if level == "HARD"]

    headline = "ok"
    for candidate in hard_reasons:
        if isinstance(candidate, str) and candidate.strip():
            headline = candidate
            break

    if hard_reasons:
        worst = "HARD"
    elif failures:
        worst = "SOFT"
    else:
        worst = "ok"

    return {
        "mode": mode,
        "route": routing.get("class"),
        "severity": worst,
        "confidence": routing.get("confidence"),
        "summary": headline,
        "judge_model": verdict.get("judge_model"),
        "prompt_version": verdict.get("prompt_version"),
        "verdict_path": verdict_path,
        "verdict_hash": verdict_hash,
    }


def _failed_findings(verdict: dict) -> list[tuple[str | None, str | None]]:
    """Collect ``(severity, reason)`` for every explicit failure in a verdict.

    Sources, in order: text stageability findings, then the ``forced_checks``
    of each panel, then of each transition. Only entries whose ``passed`` is
    exactly ``False`` count; malformed (non-dict) pieces are skipped.
    """
    collected: list[tuple[str | None, str | None]] = []

    def note_if_failed(entry) -> None:
        if isinstance(entry, dict) and entry.get("passed") is False:
            collected.append((entry.get("severity"), entry.get("reason")))

    stageability = verdict.get("text_stageability")
    if isinstance(stageability, dict):
        for finding in stageability.get("findings", []) or []:
            note_if_failed(finding)

    for section_name in ("panels", "transitions"):
        for item in verdict.get(section_name, []) or []:
            if not isinstance(item, dict):
                continue
            checks = item.get("forced_checks") or {}
            if isinstance(checks, dict):
                for entry in checks.values():
                    note_if_failed(entry)

    return collected


def run_calibration(
    labels_path,
    projects_root,
    *,
    samples: int = 5,
    tiers: tuple[str, ...] = ("claude-opus-4-8", "claude-sonnet-4-6"),
    crops_modes: tuple[bool, ...] = (True, False),
) -> dict:
    """Run the offline StoryGate calibration suite against labeled boards.

    The labels file is JSON with ``project`` and ``episode`` keys, an optional
    ``boards`` list of PNG filenames (resolved under the episode's storyboards
    directory) and an optional ``cases`` list. Each case names an ``artifact``
    path relative to the project root; case artifacts not already listed in
    ``boards`` are evaluated as well.

    For every evaluable board and every tier the text pass is sampled
    (``samples`` times when the board has a positive script-stageability case,
    once otherwise) and the image pass is sampled ``samples`` times per crops
    mode. Boards with a missing PNG or sidecar, or whose packet cannot be
    built, are skipped and reported; a packet failure is also counted in
    ``judge_unavailable_count``.

    Args:
        labels_path: Path to the labels JSON file.
        projects_root: Directory containing the project directories.
        samples: Judge samples per configuration; must be >= 1.
        tiers: Judge model identifiers to compare.
        crops_modes: ``use_crops`` settings to compare for the image pass.

    Returns:
        A dict with ``per_case`` (one entry per case x tier x crops mode),
        ``verdicts`` (raw judge results per board), ``aggregate`` (recall,
        HARD flags on presumed-pass cases, schema validity, skips) and
        ``config``.

    Raises:
        ValueError: ``samples`` is less than 1.
    """
    labels_path = Path(labels_path)
    projects_root = Path(projects_root)
    if samples < 1:
        raise ValueError("samples must be >= 1")

    labels = json.loads(labels_path.read_text(encoding="utf-8"))
    project = labels["project"]
    episode = int(labels["episode"])
    project_root = projects_root / project
    storyboards_dir = project_root / "prep" / f"ep_{episode:03d}" / "storyboards"
    gate = StoryGate("shadow")

    board_paths = [
        storyboards_dir / filename
        for filename in labels.get("boards", [])
    ]
    cases = labels.get("cases", [])
    cases_by_path: dict[Path, list[tuple[int, dict]]] = {}
    for idx, case in enumerate(cases):
        case_path = project_root / case["artifact"]
        cases_by_path.setdefault(case_path, []).append((idx, case))
        if case_path not in board_paths:
            board_paths.append(case_path)

    board_results: dict[Path, dict[str, dict[bool, list[dict]]]] = {}
    text_results: dict[Path, dict[str, list[dict]]] = {}
    skipped_boards: dict[Path, str] = {}
    schema_valid = 0
    schema_total = 0
    judge_unavailable_count = 0

    for board_path in board_paths:
        sidecar_path = Path(f"{board_path}.json")
        if not board_path.is_file():
            skipped_boards[board_path] = f"missing PNG: {board_path}"
            continue
        if not sidecar_path.is_file():
            skipped_boards[board_path] = f"missing sidecar: {sidecar_path}"
            continue
        try:
            packet = StoryGatePacket.from_sidecar(board_path)
        except Exception as exc:  # noqa: BLE001 - report packet failures, do not crash
            skipped_boards[board_path] = f"packet error: {exc}"
            judge_unavailable_count += 1
            continue

        # The text pass is only sampled repeatedly where a labeled case
        # actually exercises it; other boards get a single text sample.
        has_script_case = any(
            _is_positive_script_stageability_case(case)
            for _idx, case in cases_by_path.get(board_path, [])
        )
        text_sample_count = samples if has_script_case else 1
        text_results[board_path] = {}
        for tier in tiers:
            tier_text: list[dict] = []
            for sample_idx in range(text_sample_count):
                result = _calibration_call(
                    lambda tier=tier: gate.evaluate_text(packet, model=tier)
                )
                result["sample"] = sample_idx + 1
                tier_text.append(result)
                schema_total += 1
                if result.get("schema_valid"):
                    schema_valid += 1
                if result.get("judge_unavailable"):
                    judge_unavailable_count += 1
            text_results[board_path][tier] = tier_text

        board_results[board_path] = {}
        for tier in tiers:
            board_results[board_path][tier] = {}
            for crops_mode in crops_modes:
                verdicts: list[dict] = []
                for sample_idx in range(samples):
                    result = _calibration_call(
                        lambda tier=tier, crops_mode=crops_mode: gate.evaluate_board(
                            packet,
                            use_crops=crops_mode,
                            model=tier,
                        )
                    )
                    result["sample"] = sample_idx + 1
                    verdicts.append(result)
                    schema_total += 1
                    if result.get("schema_valid"):
                        schema_valid += 1
                    if result.get("judge_unavailable"):
                        judge_unavailable_count += 1
                board_results[board_path][tier][crops_mode] = verdicts

    per_case: list[dict] = []
    hard_flags_on_presumed_pass: list[dict] = []
    named_case_tier_total = 0
    named_case_tier_caught = 0
    by_tier: dict[str, dict] = {
        tier: {"named_total": 0, "named_caught": 0, "named_recall_or": 1.0}
        for tier in tiers
    }

    for case_idx, case in enumerate(cases):
        case_path = project_root / case["artifact"]
        is_script_case = _is_positive_script_stageability_case(case)
        for tier in tiers:
            tier_caught = False

            # The text catch vector depends only on (case, tier), so score the
            # text samples once here instead of once per crops mode.
            text_vector: list[int] | None = None
            if is_script_case:
                text_samples = text_results.get(case_path, {}).get(tier, [])
                text_vector = [
                    1 if _score_text_stageability_case(case, item.get("verdict")) else 0
                    for item in text_samples
                ]

            for crops_mode in crops_modes:
                results = board_results.get(case_path, {}).get(tier, {}).get(crops_mode)
                entry = _calibration_case_entry(
                    case_idx=case_idx,
                    case=case,
                    tier=tier,
                    crops_mode=crops_mode,
                    results=results,
                    skip_reason=skipped_boards.get(case_path),
                )
                if text_vector is not None:
                    # Fresh dict and list per entry so entries never alias.
                    entry["text_stageability"] = {
                        "catch_vector": list(text_vector),
                        "caught_or": any(text_vector),
                    }
                per_case.append(entry)
                # A named case counts as caught if EITHER surface caught it:
                # the image verdict, or (for script-stageability cases) the
                # pre-gen text pass — the text pass IS the capability under
                # test for the CONT_007 class.
                text_caught = bool(
                    (entry.get("text_stageability") or {}).get("caught_or")
                )
                tier_caught = (
                    tier_caught or bool(entry.get("caught_or")) or text_caught
                )

                if case.get("expected") == "presumed_pass" and results:
                    for result in results:
                        verdict = result.get("verdict")
                        if isinstance(verdict, dict):
                            hard_flags_on_presumed_pass.extend(
                                _hard_flags_for_presumed_pass_case(
                                    case_idx,
                                    case,
                                    verdict,
                                    tier=tier,
                                    crops_mode=crops_mode,
                                    sample=result.get("sample"),
                                )
                            )

            if case.get("expected") == "fail":
                named_case_tier_total += 1
                by_tier[tier]["named_total"] += 1
                if tier_caught:
                    named_case_tier_caught += 1
                    by_tier[tier]["named_caught"] += 1

    # Recall defaults to 1.0 when there are no named fail cases to catch.
    for tier, stats in by_tier.items():
        total = stats["named_total"]
        stats["named_recall_or"] = (
            stats["named_caught"] / total
            if total
            else 1.0
        )

    schema_valid_rate = schema_valid / schema_total if schema_total else 1.0
    named_recall_or = (
        named_case_tier_caught / named_case_tier_total
        if named_case_tier_total
        else 1.0
    )

    return {
        "per_case": per_case,
        # Raw judge outputs per board — without these, diagnosing a miss
        # costs a fresh judge call per inspection.
        "verdicts": {
            str(board_path): {
                "text": {
                    tier: tier_samples
                    for tier, tier_samples in text_results.get(board_path, {}).items()
                },
                "image": {
                    tier: {
                        ("crops" if crops_mode else "no_crops"): mode_samples
                        for crops_mode, mode_samples in modes.items()
                    }
                    for tier, modes in board_results.get(board_path, {}).items()
                },
            }
            for board_path in board_paths
            if board_path in board_results or board_path in text_results
        },
        "aggregate": {
            "named_recall_or": named_recall_or,
            "hard_flags_on_presumed_pass": hard_flags_on_presumed_pass,
            "schema_valid_rate": schema_valid_rate,
            "judge_unavailable_count": judge_unavailable_count,
            "by_tier": by_tier,
            "skipped": [
                {"board": str(path), "reason": reason}
                for path, reason in sorted(skipped_boards.items(), key=lambda item: str(item[0]))
            ],
        },
        "config": {
            "samples": samples,
            "tiers": list(tiers),
            "crops_modes": list(crops_modes),
            "prompt_version": PROMPT_VERSION,
        },
    }


def _calibration_call(fn) -> dict:
    """Invoke one judge thunk and wrap the outcome as a calibration record.

    ``StoryGateJudgeUnavailable`` is converted into a record with no verdict
    and ``judge_unavailable`` set; any other exception propagates. Otherwise
    the returned verdict is re-validated with ``validate_verdict`` and flagged
    as judge-unavailable only if its routing class says so.
    """
    try:
        verdict = fn()
    except StoryGateJudgeUnavailable as unavailable:
        return {
            "verdict": None,
            "schema_valid": False,
            "judge_unavailable": True,
            "error": unavailable.reason,
        }

    schema_problems = validate_verdict(verdict)
    routing_class = None
    if isinstance(verdict, dict):
        routing_class = (verdict.get("routing") or {}).get("class")

    return {
        "verdict": verdict,
        "schema_valid": not schema_problems,
        "schema_problems": schema_problems,
        "judge_unavailable": routing_class == "judge_unavailable",
    }


def _calibration_case_entry(
    *,
    case_idx: int,
    case: dict,
    tier: str,
    crops_mode: bool,
    results: list[dict] | None,
    skip_reason: str | None,
) -> dict:
    """Build the per-case report row for one (tier, crops mode) combination.

    The row echoes the case's label fields. When the board was skipped (an
    explicit ``skip_reason``) or produced no results, the row is marked
    ``skipped`` with an empty catch vector. Otherwise ``catch_vector`` holds a
    0/1 score per sample and ``caught_or`` is true if any sample caught it.
    """
    entry = {
        "case_index": case_idx,
        "artifact": case.get("artifact"),
        "locus": case.get("locus"),
        "expected": case.get("expected"),
        "expected_route_any": case.get("expected_route_any"),
        "expected_check": case.get("expected_check"),
        "reason": case.get("reason"),
        "tier": tier,
        "crops_mode": crops_mode,
        "catch_vector": [],
        "caught_or": False,
    }

    # An explicit skip reason wins; an unevaluated board gets a generic one.
    why_skipped = skip_reason
    if why_skipped is None and not results:
        why_skipped = "board was not evaluated"
    if why_skipped is not None:
        entry.update(skipped=True, skip_reason=why_skipped)
        return entry

    hits = [
        int(bool(_score_named_case(case, sample.get("verdict"))))
        for sample in results
    ]
    entry["catch_vector"] = hits
    entry["caught_or"] = any(hits)
    return entry


def _is_positive_script_stageability_case(case: dict) -> bool:
    """Tell whether ``case`` is an expected failure of the
    ``causal_setup_present`` check that may route to ``script_problem``."""
    if case.get("expected") != "fail":
        return False
    if case.get("expected_check") != "causal_setup_present":
        return False
    accepted_routes = case.get("expected_route_any") or []
    return "script_problem" in accepted_routes


def _score_named_case(case: dict, verdict: dict | None) -> bool:
    """Score one image verdict against a labeled expected-fail case.

    A catch requires the verdict's routing class to be among the case's
    ``expected_route_any`` AND the expected check to have failed: on the
    labeled panel (``locus.panel``), on any of the labeled panels
    (``locus.panels``), or anywhere in the verdict when no locus is given.
    """
    if not isinstance(verdict, dict) or case.get("expected") != "fail":
        return False

    accepted_routes = case.get("expected_route_any") or []
    if (verdict.get("routing") or {}).get("class") not in accepted_routes:
        return False

    check_name = case.get("expected_check")
    locus = case.get("locus") or {}
    if "panel" in locus:
        targets = [locus["panel"]]
    elif "panels" in locus:
        targets = locus.get("panels") or []
    else:
        return _any_check_failed(verdict, check_name)

    return any(
        _panel_check_failed(verdict, int(target), check_name)
        for target in targets
    )


def _score_text_stageability_case(case: dict, verdict: dict | None) -> bool:
    """Score one text-pass verdict against a labeled script-stageability case.

    A catch requires an accepted routing class plus a failed HARD finding for
    the expected check. When the case labels a locus (``panel`` or
    ``panels``), the finding's ``beat_index`` must be one of those beats.
    """
    if not isinstance(verdict, dict):
        return False

    observed_route = (verdict.get("routing") or {}).get("class")
    if observed_route not in (case.get("expected_route_any") or []):
        return False

    wanted_check = case.get("expected_check")
    stageability = verdict.get("text_stageability")
    if not isinstance(stageability, dict):
        return False

    locus = case.get("locus") or {}
    if "panel" in locus:
        labeled_beats = {int(locus["panel"])}
    elif "panels" in locus:
        labeled_beats = {int(panel) for panel in locus.get("panels") or []}
    else:
        labeled_beats = None

    for finding in stageability.get("findings", []) or []:
        if not isinstance(finding, dict):
            continue
        hard_miss = (
            finding.get("check") == wanted_check
            and finding.get("passed") is False
            and finding.get("severity") == "HARD"
        )
        if not hard_miss:
            continue
        if labeled_beats is None:
            return True
        # The labeled locus must match: a HARD causal finding on the
        # WRONG beat is not a catch for this case (PR #76 gate r2).
        beat = finding.get("beat_index")
        if beat is not None and int(beat) in labeled_beats:
            return True
    return False


def _panel_check_failed(verdict: dict, panel_index: int, expected_check: str | None) -> bool:
    """Tell whether ``expected_check`` explicitly failed on one panel.

    Only the first panel whose ``index`` equals ``panel_index`` is consulted.
    The result is true only when that panel's ``forced_checks`` is a dict
    holding a dict entry for ``expected_check`` with ``passed is False``.
    A missing panel, a missing check, or a malformed (non-dict)
    ``forced_checks`` all count as "not failed" instead of raising.
    """
    for panel in verdict.get("panels", []) or []:
        if not isinstance(panel, dict) or panel.get("index") != panel_index:
            continue
        checks = panel.get("forced_checks")
        if not isinstance(checks, dict):
            # Malformed or absent checks cannot evidence a failure.
            return False
        check = checks.get(expected_check)
        return isinstance(check, dict) and check.get("passed") is False
    return False


def _any_check_failed(verdict: dict, expected_check: str | None) -> bool:
    """Tell whether ``expected_check`` explicitly failed on any panel or
    transition of ``verdict``.

    An item counts only when its ``forced_checks`` is a dict holding a dict
    entry for ``expected_check`` with ``passed is False``. Non-dict items and
    malformed (non-dict) ``forced_checks`` are skipped instead of raising.
    """
    for section in ("panels", "transitions"):
        for item in verdict.get(section, []) or []:
            if not isinstance(item, dict):
                continue
            checks = item.get("forced_checks")
            if not isinstance(checks, dict):
                continue
            check = checks.get(expected_check)
            if isinstance(check, dict) and check.get("passed") is False:
                return True
    return False


def _hard_flags_for_presumed_pass_case(
    case_idx: int,
    case: dict,
    verdict: dict,
    *,
    tier: str,
    crops_mode: bool,
    sample: int | None,
) -> list[dict]:
    """List every failed HARD check the verdict raised on the case's panels.

    Only panels named by the case locus (``panels`` and/or ``panel``) are
    inspected. Each flag records where it came from (case, tier, crops mode,
    sample, panel, check) together with the check's reason and the verdict's
    routing class.
    """
    locus = case.get("locus") or {}
    watched = set(locus.get("panels") or [])
    if "panel" in locus:
        watched.add(locus["panel"])

    flags: list[dict] = []
    for panel in verdict.get("panels", []) or []:
        if not isinstance(panel, dict):
            continue
        index = panel.get("index")
        if index not in watched:
            continue
        checks = panel.get("forced_checks") or {}
        if not isinstance(checks, dict):
            continue
        hard_failures = (
            (name, entry)
            for name, entry in checks.items()
            if isinstance(entry, dict)
            and entry.get("passed") is False
            and entry.get("severity") == "HARD"
        )
        for name, entry in hard_failures:
            flags.append(
                {
                    "case_index": case_idx,
                    "artifact": case.get("artifact"),
                    "tier": tier,
                    "crops_mode": crops_mode,
                    "sample": sample,
                    "panel": index,
                    "check": name,
                    "reason": entry.get("reason"),
                    "route": (verdict.get("routing") or {}).get("class"),
                }
            )
    return flags
