"""Mention-ledger extraction for the breakdown layer.

The existing Stage-1 "Breakdown Pass" in ``ingest_pipeline`` writes
``global_bible.json``. This module is for the newer breakdown layer's S1
mention ledger: a Gate-A-ratified cache, not SSOT, used by the coverage
validator/proposal flow.

The LLM seam mirrors ``world_state_pass``: model resolution uses
``get_model("prose_author", "text")`` and the private call wrapper
(``_call_extraction_model``) is a thin layer over either the Claude CLI
transport or ``anthropic.messages.create``, so tests can monkeypatch it.
"""

from __future__ import annotations

import json
import os
from datetime import UTC, datetime
from pathlib import Path
from typing import Any

from recoil.core.atomic_write import atomic_write_json
from recoil.core.model_profiles import get_model
from recoil.core.paths import ProjectPaths
from recoil.pipeline._lib.breakdown_scenes import (
    Scene,
    script_content_hash,
    segment_scenes,
)


# Stamped into every ledger this module builds as its top-level
# ``schema_version`` field.
LEDGER_SCHEMA_VERSION = 1
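# EXTRACT_MAX_ATTEMPTS (assigned just below the helper) is the TOTAL number of
# calls made to the (non-deterministic) extraction model per scene — the first
# prompt plus any re-prompts — before the scene fails loud. Retrying handles
# transient malformed mentions on complex prose (REC-158). Override via the
# RECOIL_BREAKDOWN_EXTRACT_ATTEMPTS env var (values below 1 are raised to 1); a
# malformed override falls back to the default rather than crashing import.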
def _env_attempts(default: int = 3) -> int:
    """Resolve the per-scene extraction attempt budget from the environment.

    Reads ``RECOIL_BREAKDOWN_EXTRACT_ATTEMPTS``; when the variable is unset,
    ``str(default)`` stands in for it. A value that parses as an integer is
    raised to at least 1. A value that does not parse yields ``default``
    exactly as passed (no clamping).
    """
    configured = os.environ.get("RECOIL_BREAKDOWN_EXTRACT_ATTEMPTS", str(default))
    try:
        attempts = int(configured)
    except (TypeError, ValueError):
        # Malformed override: keep the module importable and use the default.
        return default
    return attempts if attempts > 1 else 1


# Resolved once, when this module is imported; the extraction loop reads this
# module-level value for its attempt budget.
EXTRACT_MAX_ATTEMPTS = _env_attempts()
# Stored under ``_meta.note`` in every ledger this module builds.
LEDGER_NOTE = (
    "Gate-A-ratified CACHE - never SSOT, never hand-edit; regenerate via "
    "breakdown_extract_cli"
)

# System prompt sent with every per-scene extraction call. The kinds and
# per-kind fields listed here are the same ones ``REQUIRED_BY_KIND`` enforces
# when the reply is validated, so the two must be kept in step.
BREAKDOWN_EXTRACT_SYSTEM = (
    "You are the S1 continuity mention extractor for a script breakdown "
    "ledger.\n"
    "Extract only explicit mentions in the supplied scene. Return strict JSON "
    "with a top-level `mentions` array and no prose.\n"
    "\n"
    "Every mention must include: kind, surface_text, span_quote. The caller "
    "will attach scene_id and scene_hash.\n"
    "Allowed kinds and required fields:\n"
    "- character: character_id\n"
    "- location: location_id\n"
    "- sublocation: location_id, sublocation\n"
    "- prop: prop_id\n"
    "- prop_state: prop_id, state_id\n"
    "- wardrobe_change: character_id, piece, change\n"
    "- transient_state: character_id, state_desc\n"
    "- identity_observation: character_id, attribute, observed_value\n"
    "\n"
    "Normalize ids against the provided vocabulary. If an explicit mention "
    "cannot be normalized, keep the surface text in the required id field. "
    "The span_quote must be a verbatim substring copied exactly from the scene "
    "text (a sentence or phrase is fine — it need not be a whole line)."
)


class BreakdownExtractError(RuntimeError):
    """Raised when S1 extraction cannot produce a valid mention ledger.

    Used throughout this module for unreadable inputs (script, existing
    ledger, bible), malformed extractor replies, and mentions or ledgers that
    fail validation.
    """


# Per-kind fields that must be present and truthy on a mention, in addition to
# COMMON_REQUIRED. A kind that is not a key of this table is rejected as
# unsupported during mention validation.
REQUIRED_BY_KIND: dict[str, tuple[str, ...]] = {
    "character": ("character_id",),
    "location": ("location_id",),
    "sublocation": ("location_id", "sublocation"),
    "prop": ("prop_id",),
    "prop_state": ("prop_id", "state_id"),
    "wardrobe_change": ("character_id", "piece", "change"),
    "transient_state": ("character_id", "state_desc"),
    "identity_observation": ("character_id", "attribute", "observed_value"),
}

# Fields every mention must carry regardless of kind. ``scene_id`` and
# ``scene_hash`` are stamped on by this module before validation rather than
# supplied by the model.
COMMON_REQUIRED = ("kind", "surface_text", "span_quote", "scene_id", "scene_hash")


def extract_mention_ledger(
    project: str,
    episode: int,
    *,
    model: str | None = None,
    write: bool = True,
) -> dict:
    """Extract or refresh the per-episode mention ledger.

    A scene whose ``scene_hash`` matches an entry in the existing ledger is
    carried forward without an LLM call (looked up by ``scene_id`` first, then
    by an unambiguous ``scene_hash``). Every other scene is sent to the
    extraction model, which is re-prompted on malformed output for up to
    ``EXTRACT_MAX_ATTEMPTS`` attempts in total.

    Args:
        project: Project name used to resolve ``ProjectPaths``.
        episode: Episode number; the script is read from
            ``ep_{episode:03d}.md`` in the project's episodes directory.
        model: Explicit model id. Defaults to
            ``get_model("prose_author", "text")``.
        write: When false, the ledger is built and returned but
            ``mention_ledger.json`` is not written (the CLI dry-run path).

    Returns:
        The ledger dict: ``schema_version``, ``_meta``, ``project``,
        ``episode``, ``script_content_hash``, ``generated_at`` and ``scenes``.

    Raises:
        BreakdownExtractError: The script cannot be read or decoded, the
            existing ledger or bible is unreadable, a carried-forward mention
            is malformed or fails validation, or extraction for a scene still
            fails after ``EXTRACT_MAX_ATTEMPTS`` attempts.
    """

    paths = ProjectPaths.for_project(project)
    script_path = paths.episodes_dir / f"ep_{episode:03d}.md"
    try:
        script_text = script_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        # UnicodeDecodeError is a ValueError, not an OSError, so a script saved
        # in another encoding must be caught explicitly to stay fail-loud with
        # the module's own error type.
        raise BreakdownExtractError(f"cannot read script: {script_path}") from exc

    scenes = segment_scenes(script_text, episode)
    content_hash = script_content_hash(script_text)
    ledger_path = paths.episode_breakdown_dir(episode) / "mention_ledger.json"
    previous = _load_existing_ledger(ledger_path)
    carry_by_id, carry_by_hash = _carried_forward_lookups(previous)
    vocab = _load_bible_vocab(paths)
    model_id = model or get_model("prose_author", "text")

    ledger_scenes: list[dict[str, Any]] = []
    for scene in scenes:
        # Prefer the prior entry with the same id; fall back to a unique hash
        # match so a renumbered but otherwise unchanged scene is still reused.
        prior = carry_by_id.get(scene.scene_id)
        if not prior or prior.get("scene_hash") != scene.scene_hash:
            prior = carry_by_hash.get(scene.scene_hash)
        carried = bool(prior) and prior.get("scene_hash") == scene.scene_hash

        if carried:
            mentions = _carry_forward_mentions(prior, scene)
        else:
            mentions = _extract_scene_mentions(scene, vocab, model_id)

        ledger_scenes.append(
            {
                "scene_id": scene.scene_id,
                "scene_hash": scene.scene_hash,
                "slugline": scene.slugline,
                "mentions": mentions,
                "carried_forward": carried,
            }
        )

    ledger = {
        "schema_version": LEDGER_SCHEMA_VERSION,
        "_meta": {"note": LEDGER_NOTE},
        "project": project,
        "episode": episode,
        "script_content_hash": content_hash,
        "generated_at": datetime.now(UTC).isoformat(),
        "scenes": ledger_scenes,
    }
    _validate_ledger_shape(ledger)

    if write:
        atomic_write_json(ledger_path, ledger, indent=2)
    return ledger


def _carry_forward_mentions(prior: dict[str, Any], scene: Scene) -> list[dict[str, Any]]:
    """Copy a prior ledger scene's mentions onto ``scene`` and re-validate them."""

    stored = prior.get("mentions", [])
    if not isinstance(stored, list) or not all(isinstance(item, dict) for item in stored):
        # A damaged cache entry should surface as the module's own error type,
        # not as a bare TypeError/ValueError from ``dict(...)``.
        raise BreakdownExtractError(
            f"existing ledger mentions are malformed for {scene.scene_id}"
        )

    mentions: list[dict[str, Any]] = []
    for item in stored:
        mention = dict(item)
        # Re-stamp: the scene may have been matched by hash under a new id.
        mention["scene_id"] = scene.scene_id
        mention["scene_hash"] = scene.scene_hash
        _validate_mention(mention, scene)
        mentions.append(mention)
    return mentions


def _extract_scene_mentions(
    scene: Scene, vocab: dict[str, Any], model_id: str
) -> list[dict[str, Any]]:
    """Call the extraction model for one scene, re-prompting on malformed output."""

    prompt = _build_scene_prompt(scene, vocab)
    # Bounded retry on validation/parse failure: the extraction model is
    # non-deterministic and occasionally emits a malformed mention (e.g. a
    # missing surface_text or a non-grounded span_quote) on complex prose.
    # Re-prompting almost always yields a conforming pass. Still fail-loud
    # after EXTRACT_MAX_ATTEMPTS so a systematically un-extractable scene
    # surfaces rather than looping (REC-158).
    last_err: BreakdownExtractError | None = None
    for _attempt in range(EXTRACT_MAX_ATTEMPTS):
        try:
            raw = _call_extraction_model(model_id, BREAKDOWN_EXTRACT_SYSTEM, prompt)
            return _parse_mentions(raw, scene)
        except BreakdownExtractError as exc:
            last_err = exc
        except Exception as exc:  # noqa: BLE001 - fail-loud operator pass
            # Anything else (transport, SDK, ...) is not retried.
            raise BreakdownExtractError(
                f"mention extraction failed for {scene.scene_id}: {type(exc).__name__}"
            ) from exc
    raise BreakdownExtractError(
        f"mention extraction failed for {scene.scene_id} after "
        f"{EXTRACT_MAX_ATTEMPTS} attempts: {last_err}"
    ) from last_err


def _load_existing_ledger(path: Path) -> dict | None:
    if not path.is_file():
        return None
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, OSError) as exc:
        raise BreakdownExtractError(f"cannot read existing ledger: {path}") from exc
    if not isinstance(data, dict):
        raise BreakdownExtractError(f"existing ledger is not a JSON object: {path}")
    return data


def _carried_forward_lookups(ledger: dict | None) -> tuple[dict[str, dict], dict[str, dict]]:
    if not ledger:
        return {}, {}
    scenes = ledger.get("scenes", [])
    if not isinstance(scenes, list):
        raise BreakdownExtractError("existing ledger scenes must be a list")
    by_id: dict[str, dict] = {}
    by_hash_all: dict[str, list[dict]] = {}
    for scene in scenes:
        if not isinstance(scene, dict):
            continue
        if scene.get("scene_id"):
            by_id[str(scene["scene_id"])] = scene
        if scene.get("scene_hash"):
            by_hash_all.setdefault(str(scene["scene_hash"]), []).append(scene)
    by_hash = {
        scene_hash: matches[0]
        for scene_hash, matches in by_hash_all.items()
        if len(matches) == 1
    }
    return by_id, by_hash


def _load_bible_vocab(paths: ProjectPaths) -> dict[str, Any]:
    """Build the id-normalization vocabulary from the project's global bible.

    Args:
        paths: Project paths; only ``global_bible_path`` is read.

    Returns:
        A dict with sorted id lists under ``characters``, ``locations`` and
        ``props``, plus ``sublocations`` mapping each location id (whose entry
        is a dict) to its sorted sublocation names. A missing bible file, a
        non-object bible, or a section that is not a JSON object contributes
        empty values.

    Raises:
        BreakdownExtractError: The bible file exists but cannot be read, is not
            valid UTF-8, or is not valid JSON.
    """
    bible: dict[str, Any] = {}
    bible_path = paths.global_bible_path
    if bible_path.is_file():
        try:
            loaded = json.loads(bible_path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and the UnicodeDecodeError
            # raised by read_text for bytes that are not valid UTF-8.
            raise BreakdownExtractError(f"cannot read bible: {bible_path}") from exc
        if isinstance(loaded, dict):
            bible = loaded

    locations = bible.get("locations")
    # Treat a non-object ``locations`` section the same way ``_keys`` does
    # (as empty) instead of crashing on ``.items()``.
    location_map = locations if isinstance(locations, dict) else {}
    sublocations: dict[str, list[str]] = {
        str(location_id): _keys(location.get("sublocations"))
        for location_id, location in location_map.items()
        if isinstance(location, dict)
    }

    return {
        "characters": _keys(bible.get("characters")),
        "locations": _keys(locations),
        "props": _keys(bible.get("props")),
        "sublocations": sublocations,
    }


def _keys(value: Any) -> list[str]:
    """Return the stringified keys of ``value`` in sorted order.

    Anything that is not a dict yields an empty list.
    """
    if not isinstance(value, dict):
        return []
    names = [str(key) for key in value]
    names.sort()
    return names


def _build_scene_prompt(scene: Scene, vocab: dict[str, Any]) -> str:
    """Render the user prompt for one scene as an indented, ASCII-only JSON string.

    The payload carries the scene's id, hash, slugline and text, the
    normalization vocabulary, and a short instruction block, in that key order.
    """
    instructions = (
        "Return JSON only: {\"mentions\": [...]}. Include every explicit "
        "character, location, sublocation, prop, prop-state, wardrobe "
        "change, transient state, and identity-bearing observation."
    )
    request: dict[str, Any] = {}
    request["scene_id"] = scene.scene_id
    request["scene_hash"] = scene.scene_hash
    request["slugline"] = scene.slugline
    request["normalization_vocabulary"] = vocab
    request["scene_text"] = scene.span_text
    request["instructions"] = instructions
    return json.dumps(request, indent=2, ensure_ascii=True)


def _parse_mentions(raw: str, scene: Scene) -> list[dict[str, Any]]:
    """Decode the extractor's reply into validated mentions for ``scene``.

    The reply may be a JSON object with a ``mentions`` key or a bare JSON
    array. A missing or null ``mentions`` value (including a bare ``null``
    reply) is treated as an empty list. Each mention is copied, stamped with
    the scene's id and hash, and validated.

    Raises:
        BreakdownExtractError: The reply is not valid JSON, the mentions value
            is not a list, an entry is not a JSON object, or a mention fails
            validation.
    """
    try:
        decoded = json.loads(_strip_json_fence(raw))
    except json.JSONDecodeError as exc:
        raise BreakdownExtractError(f"invalid JSON from extractor for {scene.scene_id}") from exc

    candidates = decoded.get("mentions") if isinstance(decoded, dict) else decoded
    if candidates is None:
        candidates = []
    elif not isinstance(candidates, list):
        raise BreakdownExtractError(f"extractor mentions must be a list for {scene.scene_id}")

    stamped: list[dict[str, Any]] = []
    for position, item in enumerate(candidates):
        if not isinstance(item, dict):
            raise BreakdownExtractError(
                f"mention {position} for {scene.scene_id} is not a JSON object"
            )
        # The scene identity always comes from the caller, never from the model.
        record = {**item, "scene_id": scene.scene_id, "scene_hash": scene.scene_hash}
        _validate_mention(record, scene)
        stamped.append(record)
    return stamped


def _strip_json_fence(raw: str) -> str:
    """Remove a Markdown code fence wrapped around the model's JSON reply.

    Text that does not start with a fence is returned stripped. For a fenced
    reply the opening fence line (with any language tag) is dropped, and so is
    the closing fence, whether it sits on its own line or is glued to the end
    of the last payload line. A reply fenced on a single line, such as
    ```` ```json {"mentions": []}``` ````, is unwrapped as well.

    Args:
        raw: The model's reply; ``None`` or empty is treated as ``""``.

    Returns:
        The reply with surrounding whitespace and fence markers removed.
    """
    fence = "```"
    text = (raw or "").strip()
    if not text.startswith(fence):
        return text

    lines = text.splitlines()
    if len(lines) == 1:
        # Whole reply on one line: peel the markers off both ends.
        inner = text[len(fence):]
        if inner.endswith(fence):
            inner = inner[: -len(fence)]
        inner = inner.strip()
        starts = [pos for pos in (inner.find("{"), inner.find("[")) if pos != -1]
        if not starts:
            # Nothing that looks like a JSON container: a bare fence line.
            return ""
        cut = min(starts)
        tag = inner[:cut].strip()
        # Drop a plain language tag ("json") in front of the payload.
        if not tag or tag.isalnum():
            return inner[cut:]
        return inner

    body = lines[1:]  # first line is the opening fence (plus language tag)
    if body[-1].strip() == fence:
        body = body[:-1]
    elif body[-1].rstrip().endswith(fence):
        # Closing fence glued to the last payload line, e.g. ``{...}```\u0060``.
        body[-1] = body[-1].rstrip()[: -len(fence)]
    return "\n".join(body).strip()


def _validate_ledger_shape(ledger: dict) -> None:
    """Check the structural shape of an assembled ledger.

    ``scenes`` must be a list of dicts; each must contain ``scene_id``,
    ``scene_hash``, ``slugline``, ``mentions`` and ``carried_forward``, and
    ``mentions`` must be a list. Values are otherwise not inspected.

    Raises:
        BreakdownExtractError: On the first shape violation found.
    """
    required_keys = ("scene_id", "scene_hash", "slugline", "mentions", "carried_forward")

    entries = ledger.get("scenes")
    if not isinstance(entries, list):
        raise BreakdownExtractError("ledger scenes must be a list")

    for entry in entries:
        if not isinstance(entry, dict):
            raise BreakdownExtractError("ledger scene entries must be objects")
        absent = next((name for name in required_keys if name not in entry), None)
        if absent is not None:
            raise BreakdownExtractError(f"ledger scene missing {absent}")
        if not isinstance(entry["mentions"], list):
            raise BreakdownExtractError("ledger scene mentions must be a list")


def _validate_mention(mention: dict[str, Any], scene: Scene) -> None:
    """Validate one stamped mention against its scene.

    Checks, in order: every ``COMMON_REQUIRED`` field is truthy; ``kind`` is a
    key of ``REQUIRED_BY_KIND`` and its per-kind fields are truthy; the
    mention's ``scene_id`` and ``scene_hash`` equal the scene's; and
    ``span_quote`` occurs in the scene text.

    Raises:
        BreakdownExtractError: On the first failed check.
    """
    missing_common = next((name for name in COMMON_REQUIRED if not mention.get(name)), None)
    if missing_common is not None:
        raise BreakdownExtractError(f"mention missing required field {missing_common}")

    kind = mention.get("kind")
    kind_fields = REQUIRED_BY_KIND.get(str(kind))
    if not kind_fields:
        raise BreakdownExtractError(f"unsupported mention kind {kind!r}")
    missing_for_kind = next((name for name in kind_fields if not mention.get(name)), None)
    if missing_for_kind is not None:
        raise BreakdownExtractError(f"{kind} mention missing required field {missing_for_kind}")

    if mention.get("scene_id") != scene.scene_id:
        raise BreakdownExtractError(f"mention scene_id mismatch for {scene.scene_id}")
    if mention.get("scene_hash") != scene.scene_hash:
        raise BreakdownExtractError(f"mention scene_hash mismatch for {scene.scene_id}")

    # The quote only has to be a SUBSTRING of the scene text, not a whole
    # newline-split line: action prose has long multi-sentence lines and the
    # model naturally quotes a sentence or sub-span (REC-158). Runs of
    # whitespace are collapsed on both sides so line-wrapping inside the quote
    # does not cause a spurious rejection.
    quote = str(mention.get("span_quote", "")).strip()
    collapsed_scene = " ".join(scene.span_text.split())
    collapsed_quote = " ".join(quote.split())
    if collapsed_quote and collapsed_quote in collapsed_scene:
        return
    raise BreakdownExtractError(
        f"span_quote is not a verbatim substring of the scene for "
        f"{scene.scene_id}: {quote!r}"
    )


def _call_extraction_model(model: str, system_prompt: str, user_prompt: str) -> str:
    """LLM call seam — monkeypatched in tests.

    Routes to ``claude_cli_call`` when ``claude_transport()`` reports ``"cli"``;
    otherwise sends a single user message through
    ``anthropic_client().messages.create`` (``max_tokens=4096``) and returns the
    text of the first content block of the response.
    """

    from recoil.core.claude_cli import claude_transport

    if claude_transport() != "cli":
        from recoil.core.anthropic_client import anthropic_client

        conversation = [{"role": "user", "content": user_prompt}]
        reply = anthropic_client().messages.create(
            model=model,
            max_tokens=4096,
            system=system_prompt,
            messages=conversation,
        )
        first_block = reply.content[0]
        return first_block.text

    from recoil.core.claude_cli import claude_cli_call

    return claude_cli_call(user_prompt, system_prompt=system_prompt, model=model)


# Public API of this module. The underscore-prefixed helpers are internal;
# ``_call_extraction_model`` is the seam tests monkeypatch.
__all__ = [
    "BREAKDOWN_EXTRACT_SYSTEM",
    "BreakdownExtractError",
    "extract_mention_ledger",
]