"""Deterministic scene segmentation for the breakdown layer.

The existing Stage-1 "Breakdown Pass" in ``ingest_pipeline`` writes a
different artifact (``global_bible.json``). This module is for the newer
breakdown ledger/coverage layer and deliberately performs no LLM calls.

``scene_index`` values in ``camera_tested/*.json`` are LLM-assigned and may
disagree with this deterministic ordinal. Breakdown ledgers key on
``scene_id`` from this segmenter; no bridge exists yet.

Scene hashes are sha256 hex digests over a normalized span where trailing
whitespace is stripped from each line. Full script content hashes are sha256
hex digests over the raw file text.
"""

from __future__ import annotations

import hashlib
import re
from dataclasses import dataclass

from recoil.pipeline._lib.episode_script import strip_metadata


@dataclass(frozen=True)
class Scene:
    """One slugline-led scene as emitted by ``segment_scenes``.

    Instances are immutable (``frozen=True``). The field notes below describe
    the values this module itself writes when it builds a scene.
    """

    # Identifier formatted as ``EP{episode:03d}_SC{ordinal:03d}``; the ordinal
    # is the 1-based position of the scene in the segmented script.
    scene_id: str
    # The slugline line with surrounding whitespace stripped.
    slugline: str
    # The scene's lines joined with "\n" and stripped of leading/trailing
    # whitespace as a whole.
    span_text: str
    # sha256 hex digest of ``span_text`` after per-line trailing-whitespace
    # stripping (see ``scene_hash``).
    scene_hash: str


def _strip_metadata(text: str) -> str:
    """Run ``text`` through the shared ``episode_script.strip_metadata`` helper.

    This is a thin local seam: what counts as "metadata" is decided entirely
    by the imported helper, and its result is returned unchanged.
    """
    cleaned = strip_metadata(text)
    return cleaned


def _is_slugline(line: str) -> bool:
    """Return True when ``line``, ignoring surrounding whitespace, opens a scene.

    A slugline is recognised purely by its prefix: the stripped line must
    begin with the exact, case-sensitive text ``INT.`` or ``EXT.``.
    """
    # str.startswith accepts a tuple and matches if any prefix matches.
    return line.strip().startswith(("INT.", "EXT."))


def _is_beat_header(line: str) -> bool:
    """Return True when ``line`` looks like a timestamped beat header.

    After surrounding whitespace is removed, the line must start with a single
    ``#``, optional whitespace, then ``[`` followed by digits, a colon and
    more digits (for example ``# [00:15``). Anything after that is ignored.
    """
    candidate = line.strip()
    found = re.match(r"^#\s*\[\d+:\d+", candidate)
    # Match objects are always truthy, so bool() mirrors an "is not None" test.
    return bool(found)


def _split_trailing_beat_prefix(lines: list[str]) -> tuple[list[str], list[str]]:
    """Peel trailing beat headers off a scene so they can lead the next one.

    Scanning backwards and ignoring blank lines, every consecutive beat header
    at the end of ``lines`` (together with the blank lines between and after
    them) is split off.

    Returns:
        ``(kept, moved)``. When the last non-blank line is not a beat header
        (or there is no non-blank line), ``kept`` is the original ``lines``
        list itself and ``moved`` is empty.
    """

    def _last_content_index(upper: int) -> int:
        """Index of the last non-blank line strictly below ``upper``, or -1."""
        idx = upper - 1
        while idx >= 0 and not lines[idx].strip():
            idx -= 1
        return idx

    tail = _last_content_index(len(lines))
    if tail < 0 or not _is_beat_header(lines[tail]):
        return lines, []

    # Keep extending the cut upwards while the previous non-blank line is
    # itself a beat header, so stacked headers travel together.
    cut = tail
    while True:
        previous = _last_content_index(cut)
        if previous < 0 or not _is_beat_header(lines[previous]):
            break
        cut = previous

    return lines[:cut], lines[cut:]


def _normalize_span(span_text: str) -> str:
    """Return ``span_text`` with trailing whitespace removed from every line.

    The text is split with ``str.splitlines`` and re-joined with ``"\\n"``, so
    other line terminators are replaced by a plain newline and a single final
    terminator is dropped.
    """
    trimmed = map(str.rstrip, span_text.splitlines())
    return "\n".join(trimmed)


def scene_hash(span_text: str) -> str:
    """Return the sha256 hex digest identifying a scene span.

    The span is first passed through ``_normalize_span`` (trailing whitespace
    stripped from each line), then encoded as UTF-8 and hashed, so edits that
    only touch trailing whitespace leave the digest unchanged.
    """
    payload = _normalize_span(span_text).encode("utf-8")
    return hashlib.sha256(payload).hexdigest()


def script_content_hash(script_text: str) -> str:
    """Return the sha256 hex digest of the script text exactly as given.

    No normalisation is applied: the text is UTF-8 encoded and hashed as-is,
    so any whitespace difference yields a different digest.
    """
    hasher = hashlib.sha256()
    hasher.update(script_text.encode("utf-8"))
    return hasher.hexdigest()


def segment_scenes(script_text: str, episode: int) -> list[Scene]:
    """Split a markdown episode script into deterministic slugline-led scenes.

    The script is first passed through ``_strip_metadata``; every line that
    ``_is_slugline`` accepts then starts a new scene.

    * Lines that appear before the first slugline are prepended to the first
      scene's span.
    * Beat headers sitting directly before a slugline are moved from the end
      of the previous scene to the start of the next one.
    * Scenes are numbered from 1 in order of appearance; ``episode`` and that
      ordinal form the ``scene_id``.

    Returns:
        The scenes in script order, or an empty list when the stripped script
        is empty or contains no slugline.
    """
    body = _strip_metadata(script_text)
    if not body:
        return []

    result: list[Scene] = []
    preamble: list[str] = []
    open_slug: str | None = None
    open_lines: list[str] = []

    for raw in body.splitlines():
        if not _is_slugline(raw):
            # Ordinary line: goes to the open scene, or to the preamble when
            # no slugline has been seen yet.
            target = preamble if open_slug is None else open_lines
            target.append(raw)
            continue

        if open_slug is None:
            # First slugline: everything collected so far leads this scene.
            carried = preamble
        else:
            # Close the previous scene, handing its trailing beat headers over.
            open_lines, carried = _split_trailing_beat_prefix(open_lines)
            result.append(_build_scene(episode, len(result) + 1, open_slug, open_lines))

        open_slug = raw.strip()
        open_lines = [*carried, raw]

    if open_slug is not None:
        result.append(_build_scene(episode, len(result) + 1, open_slug, open_lines))

    return result


def _build_scene(episode: int, ordinal: int, slugline: str, lines: list[str]) -> Scene:
    """Assemble a ``Scene`` from its slugline and collected lines.

    ``lines`` are joined with newlines and the result is stripped of leading
    and trailing whitespace; that text is both stored and hashed. The id is
    built from ``episode`` and ``ordinal``, each zero-padded to three digits.
    """
    text = "\n".join(lines).strip()
    identifier = f"EP{episode:03d}_SC{ordinal:03d}"
    digest = scene_hash(text)
    return Scene(
        scene_id=identifier,
        slugline=slugline,
        span_text=text,
        scene_hash=digest,
    )


# Public API: the scene record, the two hashing helpers, and the segmenter
# entry point. Underscore-prefixed helpers stay module-private.
__all__ = [
    "Scene",
    "scene_hash",
    "script_content_hash",
    "segment_scenes",
]
