# ==============================================================================
# PORTED FROM STARSEND: lib/critic.py
# DATE: 2026-03-29
# NOTE: For historical git blame prior to this date, see the starsend repository.
# ==============================================================================
"""
critic.py — CriticLoop base class + shared dataclasses.

Every critic in the pipeline inherits from CriticLoop.
Key contract: run() NEVER raises. On any error it returns
(original_artifact, CriticResult with outcome=Outcome.ERROR, so
passed=False and errored=True); callers route ERROR to the review queue.

Experience pool: append-only JSONL logging for future strategy agent.
"""

import json
import logging
import time
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from enum import Enum
from pathlib import Path
from typing import Any, Optional

logger = logging.getLogger(__name__)


class Outcome(Enum):
    """Three-way result of a critic run, richer than a plain passed bool.

    PASS:  check completed and met all hard criteria
    FAIL:  check completed and failed at least one hard criterion
    ERROR: check could not complete (vision API down, missing dep, crash)

    Callers MUST check `result.outcome` (or use `result.errored`) to route
    ERROR results to the review queue. Treating ERROR as PASS is a silent
    QC bypass; treating ERROR as FAIL is hostile to PC-1 keep-bias.

    The string values are what `CriticResult.to_dict()` emits for the
    "outcome" key.
    """

    PASS = "pass"
    FAIL = "fail"
    ERROR = "error"


class Severity(Enum):
    """How much weight a failed Dimension carries.

    Only HARD failures influence the outcome computed by CriticLoop.run()
    and are handed to auto_fix(); SOFT failures are reported on the result
    but never change PASS/FAIL.
    """

    HARD = "hard"  # Blocks pipeline if failed (auto-fix attempted)
    SOFT = "soft"  # Advisory only — logged but never blocks


class FailureMode(str, Enum):
    """Typed failure classification for critic dimensions.

    When a Dimension fails, failure_mode identifies the specific failure
    type for action routing in run_shot.py. This eliminates string-matching
    on Dimension.message for routing decisions.

    Members mix in `str`, so each member compares equal to (and serialises
    as) its string value.

    String values are stable — they appear in ops.log.jsonl and
    review_queue.jsonl — so existing values must not be renamed.
    """

    NONE = "none"
    ANATOMY_FACE_MERGE = "anatomy_face_merge"
    ANATOMY_LIMB_MISCOUNT = "anatomy_limb_miscount"
    IDENTITY_DRIFT = "identity_drift"
    BACKGROUND_CONTAMINATION = "background_contamination"
    WARDROBE_MISMATCH = "wardrobe_mismatch"
    LIGHTING_MISMATCH = "lighting_mismatch"
    GRID_INFLUENCE = "grid_influence"
    SAFETY_SOFTENED = "safety_softened"
    UNKNOWN = "unknown"
    MOTION_FAILURE = "motion_failure"
    END_FRAME_DRIFT = "end_frame_drift"
    CONTENT_FILTER_HARD_BLOCK = "content_filter_hard_block"
    REF_BLEED = "ref_bleed"
    AUDIO_SYNC_DRIFT = "audio_sync_drift"
    COVERAGE_GEOMETRY_BROKEN = "coverage_geometry_broken"

    # Pass-level modes added 2026-04-15 for the retry-strategy library.
    COMPOSITION_WRONG = "composition_wrong"
    STYLE_DRIFT = "style_drift"
    CUTS_TOO_SOFT = "cuts_too_soft"
    PROMPT_DURATION_MISMATCH = "prompt_duration_mismatch"
    COST_OVERRUN = "cost_overrun"
    TRANSIENT = "transient"
    GATE_MECHANICAL = "gate_mechanical"


@dataclass
class Dimension:
    """A single quality dimension checked by a critic.

    CriticLoop.evaluate() returns a list of these; CriticResult aggregates
    them and derives hard/soft failure lists from `passed` and `severity`.
    """

    # Identifier of the check. CriticLoop.run() matches dimensions across
    # attempts by this name when detecting regressions.
    name: str
    # HARD failures decide the run outcome; SOFT failures are advisory.
    severity: Severity
    # True when the check was satisfied (the default).
    passed: bool = True
    # Human-readable detail; the error path stores str(exception) here.
    message: str = ""
    # NOTE(review): not written anywhere in this module — presumably set by
    # subclasses that repair a single dimension; confirm against callers.
    auto_fixed: bool = False
    # Typed classification of the failure, or None when unclassified.
    failure_mode: FailureMode | None = None


@dataclass
class CriticResult:
    """Aggregate result from a critic run.

    Holds the per-dimension verdicts plus run-level bookkeeping (attempt
    number, timing, cost, error text). `outcome` is the source of truth;
    `passed` and `errored` are read-only views derived from it.
    """

    critic_name: str
    outcome: Outcome = Outcome.PASS
    attempt: int = 1
    dimensions: list[Dimension] = field(default_factory=list)
    auto_fixed: bool = False
    cost: float = 0.0
    elapsed_ms: float = 0.0
    error: str = ""

    @property
    def passed(self) -> bool:
        """Legacy compat: True only on Outcome.PASS. ERROR is NOT passing.

        Read-only: code that used to assign `result.passed = True` must
        assign `result.outcome = Outcome.PASS` instead.
        """
        return self.outcome == Outcome.PASS

    @property
    def errored(self) -> bool:
        """True when the check could not complete (Outcome.ERROR)."""
        return self.outcome == Outcome.ERROR

    @property
    def failed_dimensions(self) -> list[Dimension]:
        """All dimensions with passed=False, in evaluation order."""
        return [dim for dim in self.dimensions if not dim.passed]

    def _failed_with(self, severity: Severity) -> list[Dimension]:
        """Failed dimensions of the given severity, in evaluation order."""
        return [dim for dim in self.failed_dimensions if dim.severity == severity]

    @property
    def hard_failures(self) -> list[Dimension]:
        """Failed dimensions whose severity is HARD."""
        return self._failed_with(Severity.HARD)

    @property
    def soft_failures(self) -> list[Dimension]:
        """Failed dimensions whose severity is SOFT."""
        return self._failed_with(Severity.SOFT)

    @property
    def dominant_failure_mode(self) -> FailureMode | None:
        """Return the most relevant failure_mode among failed dimensions.

        Preference order: the first failed HARD dimension that carries a
        failure_mode; otherwise the first failed dimension of any severity
        that carries one; otherwise None.
        """
        classified = [
            dim for dim in self.failed_dimensions if dim.failure_mode is not None
        ]
        first_hard = next(
            (dim for dim in classified if dim.severity == Severity.HARD), None
        )
        if first_hard is not None:
            return first_hard.failure_mode
        return classified[0].failure_mode if classified else None

    def to_dict(self) -> dict:
        """Return a plain-dict form with enums flattened to their values.

        Adds "passed" and "errored" keys, because asdict() only walks
        dataclass fields and therefore omits @property values.
        """
        payload = asdict(self)

        outcome = payload["outcome"]
        if isinstance(outcome, Outcome):
            payload["outcome"] = outcome.value

        # Derived flags are appended after the dataclass fields.
        payload["passed"] = self.passed
        payload["errored"] = self.errored

        for entry in payload["dimensions"]:
            severity = entry["severity"]
            if isinstance(severity, Severity):
                entry["severity"] = severity.value
            mode = entry.get("failure_mode")
            if isinstance(mode, FailureMode):
                entry["failure_mode"] = mode.value
        return payload


class CriticLoop:
    """Base class for all pipeline critics.

    Subclasses implement:
        - evaluate(artifact, context) -> list[Dimension]
        - auto_fix(artifact, failed_dims, context) -> artifact  (optional)

    Usage:
        critic = MySubclassCritic(max_attempts=2, ...)
        artifact, result = critic.run(artifact, context)
    """

    def __init__(
        self,
        name: str,
        max_attempts: int = 1,
        experience_pool_dir: Optional[Path] = None,
        shot_id: str = "",
    ):
        """Configure the critic.

        Args:
            name: Critic name; stamped on every CriticResult and log line.
            max_attempts: Number of evaluate/auto-fix rounds. Values below 1
                are treated as 1 by run().
            experience_pool_dir: Directory that receives
                experience_pool.jsonl. Falsy disables experience logging.
            shot_id: Recorded verbatim in each experience-pool entry.
        """
        self.name = name
        self.max_attempts = max_attempts
        self.experience_pool_dir = experience_pool_dir
        self.shot_id = shot_id

    def evaluate(self, artifact: Any, context: dict) -> list[Dimension]:
        """Evaluate artifact against all dimensions. Override in subclass."""
        raise NotImplementedError

    def auto_fix(
        self, artifact: Any, failed_dims: list[Dimension], context: dict
    ) -> Any:
        """Attempt to auto-fix the artifact. Override in subclass.

        Returns modified artifact. Default: return artifact unchanged.
        """
        return artifact

    @staticmethod
    def _regressed_dimensions(
        previous: CriticResult, dims: list[Dimension]
    ) -> set[str]:
        """Names of dimensions that passed in *previous* but fail in *dims*."""
        prev_passing = {d.name for d in previous.dimensions if d.passed}
        now_failing = {d.name for d in dims if not d.passed}
        return prev_passing & now_failing

    def _finalize(
        self, artifact: Any, result: CriticResult, t0: float
    ) -> tuple[Any, CriticResult]:
        """Stamp outcome and elapsed time on *result*, log it, return the pair."""
        result.outcome = Outcome.FAIL if result.hard_failures else Outcome.PASS
        result.elapsed_ms = (time.monotonic() - t0) * 1000
        self._log_experience(result)
        return artifact, result

    def run(
        self, artifact: Any, context: dict | None = None
    ) -> tuple[Any, CriticResult]:
        """Execute the critic loop with retry + regression detection.

        Each attempt evaluates the artifact. If no HARD dimension fails the
        run ends with PASS. Otherwise auto_fix() is applied; on the last
        attempt the fixed artifact is evaluated once more. Whenever a
        dimension that passed in the previous evaluation fails in the new
        one, the earlier artifact and its result are returned instead.

        Args:
            artifact: Object under review; passed through to evaluate() and
                auto_fix() untouched by this method.
            context: Optional dict forwarded to evaluate()/auto_fix();
                None (or any falsy value) becomes an empty dict.

        Returns:
            (artifact, CriticResult). NEVER raises and never returns None.
            On any exception: (original_artifact, CriticResult with
            outcome=Outcome.ERROR, the exception text in `error`, and a
            single failed HARD "SYSTEM_ERROR" dimension).
        """
        context = context or {}
        original_artifact = artifact
        best_artifact = artifact
        best_result: CriticResult | None = None
        any_auto_fix = False  # Track if any auto-fix was applied across attempts
        t0 = time.monotonic()

        try:
            # range(1, max_attempts + 1) is empty for max_attempts < 1, which
            # would skip the loop and fall off the end returning None.
            attempts = self.max_attempts
            if attempts < 1:
                logger.warning(
                    "%s: max_attempts=%r is below 1 — running a single attempt.",
                    self.name,
                    attempts,
                )
                attempts = 1

            for attempt in range(1, attempts + 1):
                dims = self.evaluate(artifact, context)
                result = CriticResult(
                    critic_name=self.name,
                    attempt=attempt,
                    dimensions=dims,
                )

                hard_fails = result.hard_failures

                # Check for pass (all HARD dimensions pass)
                if not hard_fails:
                    result.auto_fixed = any_auto_fix
                    return self._finalize(artifact, result, t0)

                # ── Regression detection ──────────────────────────
                # If this is a retry and a previously-passing dimension
                # now fails, stop and return the earlier (better) artifact.
                if best_result is not None:
                    regressions = self._regressed_dimensions(best_result, dims)
                    if regressions:
                        logger.warning(
                            "%s: Regression detected on attempt %d — "
                            "dimensions %s regressed. Returning earlier artifact.",
                            self.name,
                            attempt,
                            regressions,
                        )
                        return self._finalize(best_artifact, best_result, t0)

                # Save current as best before attempting fix
                best_artifact = artifact
                best_result = result

                # ── Auto-fix attempt ──────────────────────────────
                artifact = self.auto_fix(artifact, hard_fails, context)
                result.auto_fixed = True
                any_auto_fix = True

                # If this was the last attempt, evaluate once more after fix
                if attempt == attempts:
                    dims = self.evaluate(artifact, context)
                    result = CriticResult(
                        critic_name=self.name,
                        attempt=attempt,
                        dimensions=dims,
                        auto_fixed=True,
                    )
                    # Regression check on final evaluation
                    if self._regressed_dimensions(best_result, dims):
                        logger.warning(
                            "%s: Regression on final auto-fix eval — "
                            "returning pre-fix artifact.",
                            self.name,
                        )
                        return self._finalize(best_artifact, best_result, t0)

                    return self._finalize(artifact, result, t0)

            # Unreachable: the last iteration always returns. Kept so a
            # future edit to the loop cannot silently return None.
            raise RuntimeError("critic loop exited without producing a result")

        except Exception as e:
            # exc_info keeps the traceback; the message alone is rarely
            # enough to diagnose a crashed critic.
            logger.error(
                "%s: Unhandled error — ERROR outcome: %s",
                self.name,
                e,
                exc_info=True,
            )
            error_result = CriticResult(
                critic_name=self.name,
                outcome=Outcome.ERROR,
                error=str(e),
                elapsed_ms=(time.monotonic() - t0) * 1000,
                dimensions=[
                    Dimension(
                        name="SYSTEM_ERROR",
                        severity=Severity.HARD,
                        passed=False,
                        message=str(e),
                    )
                ],
            )
            self._log_experience(error_result)
            return original_artifact, error_result

    def _log_experience(self, result: CriticResult) -> None:
        """Append entry to experience pool JSONL. fcntl.flock-protected; no truncation.

        Best-effort: does nothing when experience_pool_dir is falsy, and any
        failure (including a missing fcntl module) is logged at DEBUG level
        and swallowed so logging can never break run().

        fcntl.flock is cross-process — safe for both threading and ProcessPoolExecutor
        parallelism. Truncation removed (Opus + Gemini audit, 2026-04-09): the prior
        cap-at-500 was a race condition + data loss hazard, and JSONL at 10K lines is
        only ~2 MB which is not a concern. Periodic compaction can be a separate job.
        """
        if not self.experience_pool_dir:
            return
        try:
            import fcntl  # POSIX only — macOS + Linux. Recoil targets these.

            pool_path = self.experience_pool_dir / "experience_pool.jsonl"
            pool_path.parent.mkdir(parents=True, exist_ok=True)

            entry = {
                "ts": datetime.now(timezone.utc).isoformat(),
                "critic": self.name,
                "shot_id": self.shot_id,
                "passed": result.passed,
                # "passed" collapses FAIL and ERROR; record the ternary
                # outcome so pool consumers can tell them apart.
                "outcome": result.outcome.value,
                "attempt": result.attempt,
                "failures": [d.name for d in result.failed_dimensions],
                "failure_modes": [
                    d.failure_mode.value
                    for d in result.failed_dimensions
                    if d.failure_mode is not None
                ],
                "auto_fixed": result.auto_fixed,
                "final_passed": result.passed,
                "cost": result.cost,
                "error": result.error or None,
            }

            # Append with exclusive file lock — cross-process safe
            with open(pool_path, "a", encoding="utf-8") as f:
                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
                try:
                    f.write(json.dumps(entry) + "\n")
                    # Flush while still holding the lock so the line reaches
                    # the file before another writer can append.
                    f.flush()
                finally:
                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)
        except Exception as e:
            logger.debug("Experience pool write failed: %s", e)


# ──────────────────────────────────────────────────────────────────────
# CP-9 Phase 7 — LegacyFlashCriticEvalNode adapter
#
# Wraps any concrete `CriticLoop` instance (StartFrameCritic, VideoFrameCritic,
# RefImageCritic, etc.) as an `EvalNode` satisfying the Phase 3 Protocol:
# `judge_id: str`, `model_used: str`, `evaluate(context) -> EvalResult`.
#
# The legacy `CriticLoop` body is UNTOUCHED — this adapter is a thin shim
# that calls `instance.run(artifact, context)` and translates the
# `CriticResult` outcome (PASS / FAIL / ERROR) + dimensions into an
# `EvalResult`:
#     PASS  → score 1.0
#     FAIL  → score 0.0
#     ERROR → score 0.0
#
# Joined `dimension.reason` (falls back to `dimension.message`) lands in
# `EvalResult.reasoning`. `cost_usd=0.0` because `CriticResult.cost` is
# never populated by the legacy run loop (the field exists but stays at
# its 0.0 default) — the adapter conservatively reports 0.0 so
# PanelOfJudges cost-cap accounting isn't double-counted.
#
# Production callers of the legacy critic are NOT migrated in CP-9 — this
# adapter exists so a CP-9 PanelOfJudges can include a legacy critic as one
# judge alongside Gemini 3.1 Pro judges. Production migration is a CP-N+
# task gated on JT review.
#
# Per Phase 1 audit § 12f corrections (LOCKED): adapter goes here at the
# bottom of `recoil/core/critic.py`. Engine-fix-phase-D Phase 4 removed
# the legacy `pipeline/lib/critic.py` wildcard-shim proxy — callers now
# import `LegacyFlashCriticEvalNode` directly from `core.critic` (this
# module's __all__).
# ──────────────────────────────────────────────────────────────────────


class LegacyFlashCriticEvalNode:
    """EvalNode adapter wrapping a legacy `CriticLoop` instance.

    Per Phase 1 audit § 12f items 2-3 (LOCKED): the adapter takes a
    `CriticLoop` instance in its constructor (any concrete subclass) and
    calls `instance.run(artifact, context)` — there is NO free-function
    `run_visual_critic` in the legacy critic module. The Phase 3
    `EvalNode` Protocol is satisfied by the three attributes
    `judge_id` / `model_used` / `evaluate(context)`.

    Outcome → score mapping (per § 12f R3 LOCKED):
        Outcome.PASS  → 1.0
        Outcome.FAIL  → 0.0
        Outcome.ERROR → 0.0
        (any future Outcome member) → 0.0 (silent default; PC-1 keep-bias)

    `cost_usd` is hardcoded `0.0`: `CriticResult` has a `cost` field, but
    `CriticLoop.run()` never populates it, so forwarding it would be
    meaningless. The wrapped critic is responsible for any cost tracking;
    the adapter does not modify or wrap it.

    `judge_id` defaults to `"legacy_flash_critic_v1"`. `model_used` is
    seeded from the wrapped critic's `model_id` at construction
    (`"unknown"` when absent) and refreshed on each `evaluate` call; if
    the critic later loses its `model_id`, the last-seen value is kept.

    Adapter does NOT modify the wrapped critic in any way — the wrapped
    instance is treated as opaque and read-only after construction.
    """

    def __init__(
        self,
        critic_instance: "CriticLoop",
        judge_id: str = "legacy_flash_critic_v1",
    ) -> None:
        """Wrap *critic_instance* and expose it under *judge_id*."""
        self.critic = critic_instance
        self.judge_id = judge_id
        # EvalNode Protocol requires a `model_used: str` attribute. Seed it
        # from the wrapped critic if available; refreshed lazily on each
        # evaluate() so a critic that mutates its model_id mid-run is
        # reflected accurately in the scorecard.
        self.model_used: str = str(getattr(critic_instance, "model_id", "unknown"))

    @staticmethod
    def _join_reasoning(dimensions) -> str:
        """Join the non-empty per-dimension explanations with " | ".

        Prefers `.reason` when a dimension has one (newer Dimension shapes
        may add it) and falls back to `.message`, the documented field
        today. Blank texts are skipped: passing dimensions default to an
        empty message and would otherwise leave dangling separators.
        """
        fragments = []
        for dim in dimensions:
            text = getattr(dim, "reason", None) or getattr(dim, "message", "") or ""
            if text:
                fragments.append(text)
        return " | ".join(fragments)

    def evaluate(self, context: "EvalContext") -> "EvalResult":  # noqa: F821
        """Run the wrapped critic on the context's artifact and score it.

        Args:
            context: EvalContext; this method reads its `metadata`,
                `prompt`, `rubric`, `judge_id`, `target_take` and
                `target_artifact_path` attributes.

        Returns:
            EvalResult with score 1.0 for Outcome.PASS and 0.0 otherwise,
            the joined dimension texts as reasoning, and the outcome value
            plus dimension names in metadata.
        """
        # Defer the eval-module import to call time. Importing
        # `pipeline.core.eval` at module load would circle back through
        # the pipeline package while `core.critic` is still resolving for
        # callers that import critic before eval is initialised.
        from recoil.pipeline.core.eval import EvalResult

        # Pass-through: legacy critic expects a dict, but EvalContext carries
        # typed first-class fields (prompt / rubric / judge_id / target_take)
        # plus arbitrary metadata. Merge so wrapped critics can read either
        # interface — explicit keys win over metadata keys with the same name.
        legacy_context = {
            **(context.metadata or {}),
            "prompt": context.prompt,
            "rubric": context.rubric,
            "judge_id": context.judge_id,
            "target_take": context.target_take,
        }
        # The returned artifact is ignored: the adapter only scores, it does
        # not hand any auto-fixed artifact back to the panel.
        _, result = self.critic.run(
            artifact=str(context.target_artifact_path),
            context=legacy_context,
        )
        score = 1.0 if result.outcome == Outcome.PASS else 0.0
        reasoning = self._join_reasoning(result.dimensions)

        # Refresh model_used from the wrapped critic in case it mutated.
        self.model_used = str(getattr(self.critic, "model_id", self.model_used))

        return EvalResult(
            score=score,
            reasoning=reasoning,
            judge_id=self.judge_id,
            model_used=self.model_used,
            cost_usd=0.0,  # CriticResult.cost is not reliably populated
            metadata={
                "outcome": result.outcome.value,
                "dimensions": [d.name for d in result.dimensions],
            },
        )


# Names exported by `from <this module> import *`; also the list of symbols
# callers are expected to import from this module.
__all__ = [
    # Public symbols (Phase D — MF-3 + DEBT-9).
    # Enums.
    "FailureMode",
    "Outcome",
    "Severity",
    # Dataclasses / result types.
    "CriticResult",
    "Dimension",
    # Base critic loop + adapters.
    "CriticLoop",
    "LegacyFlashCriticEvalNode",
]