#!/usr/bin/env python3
"""reclaim_orphans.py — retroactively attach synthetic provenance to orphan videos.

Usage:
    python3 pipeline/tools/reclaim_orphans.py --project driver-beware --dry-run
    python3 pipeline/tools/reclaim_orphans.py --project driver-beware --apply --confidence-min medium
    python3 pipeline/tools/reclaim_orphans.py --project driver-beware --apply --only 'REGEN_P02_*' --confidence-min low

Walks {projects_root()}/{project}/output/video/*/.../_orphans/, for each .mp4 builds
a synthetic {shot_id}_meta.yaml, and (on --apply) moves files out of _orphans/
back to the parent episode directory.

Confidence:
    high   — orphan had a populated pipeline sidecar (.mp4.json providing a model or a prompt)
    medium — no usable sidecar, but the filename yields a shot_id and ffprobe reports a duration
    low    — fallback: everything else (e.g. filename only, no sidecar, no ffprobe data)

Per PLAN §8 and SYNTHESIS §B — synthetic provenance is always marked
`reclaim.synthetic: true` so downstream tools can distinguish reclaimed from
authentic records.
"""

from __future__ import annotations

import argparse
import fnmatch
import json
import os
import re
import shutil
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# ── Path setup ──────────────────────────────────────────────────
_RECOIL_ROOT = Path(__file__).resolve().parent.parent.parent
if str(_RECOIL_ROOT) not in sys.path:
    sys.path.insert(0, str(_RECOIL_ROOT))

from recoil.core.paths import projects_root, ProjectPaths  # type: ignore


# Ordinal rank of each confidence label. main() compares an orphan's rank with
# the rank of --confidence-min and skips the orphan when it ranks lower.
CONFIDENCE_RANK = {"low": 0, "medium": 1, "high": 2}


# ── Filename parsing ────────────────────────────────────────────

# Suffix patterns. The first three are anchored at end-of-string so they can be
# peeled off one at a time; the pipeline pattern may match anywhere in the stem.
_TAKE_RE = re.compile(r"_take(\d+)$", re.IGNORECASE)
_VERSION_RE = re.compile(r"_v(\d+)$", re.IGNORECASE)
_MODEL_HINTS_RE = re.compile(r"_(seedance|kling|veo|nbp|gemini)$", re.IGNORECASE)
_PIPELINE_HINTS_RE = re.compile(r"_(i2v|t2v|r2v|multishot|multi_shot|beat_by_beat|full_beat|stills|loose|low_angle_jump|lowangle_to_profile)(?=_|$)", re.IGNORECASE)


def parse_orphan_filename(stem: str) -> dict:
    """Split an orphan's filename stem into shot_id, take number and hints.

    A leading ``shot_`` prefix is dropped, then trailing ``_take<N>``,
    ``_v<N>`` and ``_<model>`` tokens are peeled off the end in whatever
    order they appear (e.g. ``shot_123_take1_v2_kling`` -> shot_id ``123``).
    A pipeline token such as ``_i2v`` is reported as a hint but stays part
    of the shot_id.

    Returns:
        ``{"shot_id": str, "take_number": int,
        "hints": {"pipeline": str | None, "model": str | None,
        "versioned": bool}}``. ``take_number`` defaults to 1 when the stem
        carries no take token.
    """
    prefix = "shot_"
    remainder = stem[len(prefix):] if stem.startswith(prefix) else stem

    take_number = 1
    model_hint = None
    versioned = False

    # Each pass removes at most one trailing token, then restarts the scan
    # from the take pattern, so any ordering of the three suffix kinds is
    # consumed completely.
    suffix_patterns = (
        ("take", _TAKE_RE),
        ("version", _VERSION_RE),
        ("model", _MODEL_HINTS_RE),
    )
    peeling = True
    while peeling:
        peeling = False
        for kind, pattern in suffix_patterns:
            found = pattern.search(remainder)
            if found is None:
                continue
            if kind == "take":
                take_number = int(found.group(1))
            elif kind == "version":
                versioned = True
            else:
                model_hint = found.group(1).lower()
            remainder = remainder[: found.start()]
            peeling = True
            break

    # The pipeline token is deliberately left inside the shot_id for
    # readability; it is only surfaced as a hint.
    pipeline_match = _PIPELINE_HINTS_RE.search(remainder)
    pipeline_hint = pipeline_match.group(1).lower() if pipeline_match else None

    return {
        "shot_id": remainder.strip("_"),
        "take_number": take_number,
        "hints": {
            "pipeline": pipeline_hint,
            "model": model_hint,
            "versioned": versioned,
        },
    }


# ── ffprobe fallback ────────────────────────────────────────────

def run_ffprobe(video: Path) -> dict:
    """Probe the first video stream of *video* with the ``ffprobe`` CLI.

    Returns:
        ``{"duration": int | None, "aspect_ratio": str | None,
        "width": int, "height": int}`` when a video stream was found.
        ``duration`` is whole seconds (rounded) and is ``None`` when ffprobe
        reports no usable value; ``aspect_ratio`` is ``"W:H"``, falling back
        to the reduced width:height ratio when ffprobe gives none.

        ``{}`` when ffprobe is missing, fails, times out (10 s), emits
        unparseable output, or the file has no video stream.
    """
    from math import gcd

    try:
        out = subprocess.check_output(
            [
                "ffprobe", "-v", "error",
                "-select_streams", "v:0",
                "-show_entries", "stream=width,height,duration,display_aspect_ratio",
                "-of", "json",
                str(video),
            ],
            stderr=subprocess.DEVNULL,
            timeout=10,
        )
        data = json.loads(out)
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired,
            OSError, ValueError):
        # OSError covers a missing or non-executable ffprobe binary;
        # ValueError covers JSONDecodeError and UnicodeDecodeError.
        return {}

    streams = data.get("streams") if isinstance(data, dict) else None
    if not isinstance(streams, list) or not streams or not isinstance(streams[0], dict):
        # No video stream: report "nothing recovered" rather than a truthy
        # dict of empty values that callers would count as ffprobe data.
        return {}
    stream = streams[0]

    width = stream.get("width") or 0
    height = stream.get("height") or 0

    # An unparseable duration (e.g. "N/A") must not discard the dimensions
    # that were probed successfully, so it is parsed in isolation.
    duration_seconds = None
    raw_duration = stream.get("duration")
    if raw_duration:
        try:
            duration_seconds = int(round(float(raw_duration)))
        except (TypeError, ValueError, OverflowError):
            duration_seconds = None

    ar = stream.get("display_aspect_ratio")
    if ar in ("N/A", "0:1"):
        # Placeholder values carry no information; derive the ratio instead.
        ar = None
    if not ar and width and height:
        try:
            w, h = int(width), int(height)
        except (TypeError, ValueError):
            w = h = 0
        if w > 0 and h > 0:
            g = gcd(w, h) or 1
            ar = f"{w // g}:{h // g}"

    return {
        "duration": duration_seconds,
        "aspect_ratio": ar,
        "width": width,
        "height": height,
    }


# ── Sidecar parsing ─────────────────────────────────────────────

def parse_mp4_sidecar(sidecar_path: Path) -> dict:
    """Read the useful fields out of a pre-existing ``.mp4.json`` sidecar.

    Returns:
        ``{model, prompt, cost, duration, aspect_ratio, mode, shot_id,
        pipeline}`` taken from the sidecar's ``provenance`` mapping
        (``duration``/``aspect_ratio``/``mode`` come from its nested
        ``generation_params``); absent fields are ``None``.

        ``{}`` when the sidecar is absent, unreadable, not valid UTF-8 JSON,
        not a JSON object, or a stub with an empty/missing ``provenance``.
    """
    try:
        data = json.loads(sidecar_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing or unreadable file. ValueError: JSONDecodeError
        # and UnicodeDecodeError (both are ValueError subclasses).
        return {}

    # A syntactically valid sidecar can still have the wrong shape
    # (e.g. a top-level list); treat that as corrupt rather than crashing.
    if not isinstance(data, dict):
        return {}
    prov = data.get("provenance")
    if not isinstance(prov, dict) or not prov:
        return {}  # stub sidecar — no useful data

    gp = prov.get("generation_params")
    if not isinstance(gp, dict):
        gp = {}

    return {
        "model": prov.get("model"),
        "prompt": prov.get("prompt"),
        "cost": prov.get("cost"),
        "duration": gp.get("duration"),
        "aspect_ratio": gp.get("aspect_ratio"),
        "mode": gp.get("mode"),
        "shot_id": prov.get("shot_id"),
        "pipeline": prov.get("pipeline"),
    }


# ── Classification ──────────────────────────────────────────────

def classify_confidence(sidecar_bits: dict, ffprobe_bits: dict, parsed: dict) -> str:
    """Grade how much provenance was recovered for one orphan.

    Returns:
        ``"high"`` when the sidecar supplied a model or a prompt;
        ``"medium"`` when the filename yielded a shot_id and ffprobe
        reported a (non-zero) duration; ``"low"`` otherwise.
    """
    # Sidecar evidence outranks everything else.
    if sidecar_bits.get("model") or sidecar_bits.get("prompt"):
        return "high"

    filename_ok = bool(parsed.get("shot_id"))
    probe_ok = bool(ffprobe_bits.get("duration"))
    return "medium" if filename_ok and probe_ok else "low"


# ── Synthetic meta.yaml emission ────────────────────────────────

def build_synthetic_meta(
    video: Path,
    parsed: dict,
    sidecar_bits: dict,
    ffprobe_bits: dict,
    confidence: str,
) -> dict:
    """Construct the synthetic meta.yaml dict for one orphan video.

    Args:
        video: Path to the orphan video; its mtime supplies the generation
            timestamp and id, and ``boundary_frames/`` next to it is searched
            for a start frame.
        parsed: Result of ``parse_orphan_filename`` (``shot_id``,
            ``take_number``, ``hints``).
        sidecar_bits: Result of ``parse_mp4_sidecar`` (may be ``{}``).
        ffprobe_bits: Result of ``run_ffprobe`` (may be ``{}``).
        confidence: Label stored verbatim under ``reclaim.confidence``.

    Returns:
        A dict with a ``generation`` section (sidecar values preferred, then
        filename hints / ffprobe, then ``"unknown"`` defaults) and a
        ``reclaim`` section that always carries ``synthetic: True`` plus the
        inference sources and the list of fields that could not be recovered.

    Raises:
        OSError: if *video* cannot be stat'ed.
    """
    from glob import escape as glob_escape

    unknown_prompt = "(unknown — synthetic reclaim)"
    endpoint_by_mode = {
        "image2video": "i2v",
        "text2video": "t2v",
        "i2v": "i2v",
        "t2v": "t2v",
        "r2v": "r2v",
        "multishot": "multi_shot",
        "multi_shot": "multi_shot",
    }

    mtime = datetime.fromtimestamp(video.stat().st_mtime, tz=timezone.utc)
    shot_id = parsed["shot_id"]
    take_number = parsed["take_number"]
    hints = parsed["hints"]

    prompt_text = sidecar_bits.get("prompt") or unknown_prompt
    prompt_known = prompt_text != unknown_prompt
    # The placeholder is not a prompt, so its words must not be counted.
    prompt_word_count = len(str(prompt_text).split()) if prompt_known else 0

    model = sidecar_bits.get("model") or hints.get("model") or "unknown"
    mode = sidecar_bits.get("mode") or hints.get("pipeline") or "unknown"
    # Case-insensitive lookup; a non-string mode (malformed sidecar) maps to
    # "unknown" instead of raising on an unhashable key.
    endpoint = (
        endpoint_by_mode.get(mode.lower(), "unknown")
        if isinstance(mode, str)
        else "unknown"
    )

    duration = sidecar_bits.get("duration") or ffprobe_bits.get("duration")
    aspect_ratio = sidecar_bits.get("aspect_ratio") or ffprobe_bits.get("aspect_ratio") or "16:9"
    try:
        cost = float(sidecar_bits.get("cost") or 0.0)
    except (TypeError, ValueError):
        # A non-numeric cost in the sidecar must not abort the reclaim.
        cost = 0.0

    inference_sources: list[str] = ["filename"]
    if sidecar_bits:
        inference_sources.append("sidecar_json")
    if ffprobe_bits:
        inference_sources.append("ffprobe")

    # Boundary frame detection (best-effort). The shot_id is escaped so glob
    # metacharacters in it are matched literally, and candidates are sorted
    # so the chosen frame does not depend on directory listing order.
    boundary_dir = video.parent / "boundary_frames"
    start_frame = None
    if boundary_dir.is_dir():
        candidates = sorted(boundary_dir.glob(f"{glob_escape(shot_id)}*_start.*"))
        if candidates:
            start_frame = str(candidates[0])
            inference_sources.append("boundary_frames")

    unknown_fields: list[str] = []
    if model == "unknown":
        unknown_fields.append("generation.model")
    if not prompt_known:
        unknown_fields.append("generation.prompt_text")
    if not start_frame:
        unknown_fields.append("generation.inputs.start_frame")

    synth_id = f"{shot_id}_{int(mtime.timestamp())}"
    meta = {
        "generation": {
            "id": synth_id,
            "timestamp": mtime.isoformat().replace("+00:00", "Z"),
            "model": model,
            "endpoint": endpoint,
            "prompt_builder": "synthetic_reclaim",
            "prompt_text": prompt_text,
            "prompt_word_count": prompt_word_count,
            "inputs": {
                "character_refs": [],
                "start_frame": start_frame,
            },
            "parameters": {
                "duration": duration,
                "aspect_ratio": aspect_ratio,
            },
            "cost_usd": round(cost, 4),
            "latency_seconds": None,
        },
        "reclaim": {
            "synthetic": True,
            "reclaimed_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
            "inference_sources": inference_sources,
            "confidence": confidence,
            "unknown_fields": unknown_fields,
            "original_filename": video.name,
            "take_number": take_number,
        },
    }
    return meta


# ── Reclaim operation ───────────────────────────────────────────

def _classify_one(video: Path) -> dict:
    """Inspect one orphan a single time and bundle everything learned.

    The bundle is shared by the classify and apply paths so ffprobe and the
    sidecar parse run only once per video.

    Returns:
        ``{"video", "sidecar", "parsed", "confidence", "meta"}`` where
        ``sidecar`` is the expected ``{name}.mp4.json`` path (whether or not
        it exists) and ``meta`` is the synthetic meta dict.
    """
    sidecar_path = video.with_suffix(video.suffix + ".json")  # {name}.mp4.json

    # Only parse the sidecar when it is actually present on disk.
    if sidecar_path.is_file():
        from_sidecar = parse_mp4_sidecar(sidecar_path)
    else:
        from_sidecar = {}
    from_ffprobe = run_ffprobe(video)
    from_name = parse_orphan_filename(video.stem)

    level = classify_confidence(from_sidecar, from_ffprobe, from_name)
    bundle = {
        "video": video,
        "sidecar": sidecar_path,
        "parsed": from_name,
        "confidence": level,
    }
    bundle["meta"] = build_synthetic_meta(
        video, from_name, from_sidecar, from_ffprobe, level
    )
    return bundle


def _record_from_classification(c: dict) -> dict:
    """Flatten a ``_classify_one`` bundle into the standard record dict.

    The record is what gets printed and, after an apply, appended to the
    reclaim log; ``applied`` always starts out ``False``.
    """
    parsed = c["parsed"]
    reclaim_section = c["meta"]["reclaim"]

    record = {"video": str(c["video"])}
    record["shot_id"] = parsed["shot_id"]
    record["take_number"] = parsed["take_number"]
    record["confidence"] = c["confidence"]
    record["inference_sources"] = reclaim_section["inference_sources"]
    record["unknown_fields"] = reclaim_section["unknown_fields"]
    record["applied"] = False
    return record


def reclaim_one(video: Path, apply: bool, log_path: Path,
                classification: Optional[dict] = None) -> dict:
    """Classify and (if apply) reclaim one orphan. Returns the record dict.

    `classification` may be passed in from a prior _classify_one() call to
    avoid re-running ffprobe + sidecar parse twice (main passes the prior
    bundle through on the apply pass).

    On apply, the synthetic meta.yaml is written to the parent of the
    video's directory, then the video, its ``.mp4.json`` sidecar and any
    matching boundary frames are moved there, and the record is appended to
    *log_path* as one JSON line.

    Nothing is written or moved, and ``record["skipped"]`` names the reason,
    when:
        * ``target_meta_exists_not_synthetic`` — a non-reclaim meta.yaml
          already sits at the target;
        * ``target_meta_unparseable`` — the existing meta.yaml cannot be
          read or parsed;
        * ``target_video_exists`` / ``target_sidecar_exists`` — a file of
          the same name already exists in the destination (moving would
          silently overwrite it).

    Raises:
        OSError / yaml.YAMLError: if writing the meta or moving files fails.
    """
    c = classification or _classify_one(video)
    record = _record_from_classification(c)

    if not apply:
        return record

    import copy
    from glob import escape as glob_escape

    # Imported lazily so a dry run does not require PyYAML.
    import yaml

    sidecar = c["sidecar"]
    shot_id = c["parsed"]["shot_id"]
    take_number = c["parsed"].get("take_number", 1)
    # destination = parent of _orphans/ (the normal episode dir).
    # Note: orphans are gathered via a non-recursive `_orphans/*.mp4` glob
    # in main(), so _orphans/ is always exactly one level deep.
    dest_dir = video.parent.parent

    # take==1 keeps the canonical `{shot_id}_meta.yaml` name (matches the
    # rest of the recoil pipeline, where meta.yaml is keyed by shot_id
    # alone for the latest take). take>1 disambiguates with `_take{N}` so
    # multiple takes of the same shot_id don't silently overwrite each
    # other's synthetic metas (the idempotence guard below only protects
    # NON-synthetic metas).
    if take_number > 1:
        meta_target = dest_dir / f"{shot_id}_take{take_number}_meta.yaml"
    else:
        meta_target = dest_dir / f"{shot_id}_meta.yaml"
    video_target = dest_dir / video.name
    sidecar_target = dest_dir / sidecar.name if sidecar.is_file() else None

    # Idempotence: if a non-reclaim meta.yaml already exists at target,
    # DO NOT overwrite it. The broad except is deliberate: any failure to
    # read or interpret the existing file means we must leave it alone.
    if meta_target.is_file():
        try:
            existing = yaml.safe_load(meta_target.read_text(encoding="utf-8")) or {}
            if not (existing.get("reclaim") or {}).get("synthetic"):
                record["applied"] = False
                record["skipped"] = "target_meta_exists_not_synthetic"
                return record
        except Exception:
            record["applied"] = False
            record["skipped"] = "target_meta_unparseable"
            return record

    # Collision guard: shutil.move() replaces an existing destination file,
    # which would destroy a non-orphan render that happens to share the
    # orphan's name. Check before anything is written so a skip leaves the
    # tree untouched.
    move_video = video_target.resolve() != video.resolve()
    if move_video and video_target.exists():
        record["applied"] = False
        record["skipped"] = "target_video_exists"
        return record
    move_sidecar = (
        sidecar_target is not None
        and sidecar_target.resolve() != sidecar.resolve()
    )
    if move_sidecar and sidecar_target.exists():
        record["applied"] = False
        record["skipped"] = "target_sidecar_exists"
        return record

    # Plan the boundary-frame moves up front (source dir is relative to the
    # orphan's current location). The shot_id is escaped so glob
    # metacharacters in it match literally. Frames whose target already
    # exists are left where they are rather than overwritten.
    # NOTE(review): this is still a prefix match, so shot_id "S1" also picks
    # up frames named "S10..." — confirm the frame naming convention before
    # tightening.
    boundary_dir = video.parent / "boundary_frames"
    boundary_dest = dest_dir / "boundary_frames"
    frame_moves = []
    if boundary_dir.is_dir():
        for bf in sorted(boundary_dir.glob(f"{glob_escape(shot_id)}*")):
            if not bf.is_file():
                continue
            bf_target = boundary_dest / bf.name
            if bf_target.exists():
                continue
            frame_moves.append((bf, bf_target))

    # The meta was built while the start frame still lived under _orphans/.
    # Point it at the post-move location so the written meta.yaml does not
    # reference a path that is about to disappear. Work on a copy so the
    # caller's classification bundle is not mutated.
    meta = copy.deepcopy(c["meta"])
    inputs = (meta.get("generation") or {}).get("inputs")
    if isinstance(inputs, dict) and inputs.get("start_frame"):
        relocated = {str(src): str(dst) for src, dst in frame_moves}
        inputs["start_frame"] = relocated.get(
            inputs["start_frame"], inputs["start_frame"]
        )

    # Write meta.yaml (atomic via tmp + replace; clean up tmp on failure)
    tmp_meta = meta_target.with_suffix(".yaml.tmp")
    try:
        with tmp_meta.open("w", encoding="utf-8") as f:
            yaml.safe_dump(meta, f, default_flow_style=False, sort_keys=False, allow_unicode=True)
        os.replace(tmp_meta, meta_target)
    except Exception:
        if tmp_meta.exists():
            try:
                tmp_meta.unlink()
            except OSError:
                pass
        raise

    # Move the video and its sidecar out of _orphans/.
    if move_video:
        shutil.move(str(video), str(video_target))
    if move_sidecar and sidecar.is_file():
        shutil.move(str(sidecar), str(sidecar_target))

    # Move boundary frames adjacent to the shot.
    for bf, bf_target in frame_moves:
        bf_target.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(bf), str(bf_target))

    record["applied"] = True
    record["meta_yaml"] = str(meta_target)

    # Log reclaim_log.jsonl (one-line-per-entry)
    with log_path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(record) + "\n")

    return record


# ── CLI ──────────────────────────────────────────────────────────

def main(argv: Optional[list[str]] = None) -> int:
    """CLI entry point: find orphan videos for a project and reclaim them.

    Every ``*.mp4`` directly inside an ``_orphans/`` directory one level
    below the project's renders dir is classified. Orphans below
    ``--confidence-min`` are skipped; the rest are listed and, with
    ``--apply``, reclaimed via ``reclaim_one``. Without ``--apply`` the run
    is a dry run.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 on completion, 1 when the project does not use pass naming,
        2 when the project's renders dir does not exist. Passing both
        ``--apply`` and ``--dry-run`` exits through ``argparse`` instead.
    """
    p = argparse.ArgumentParser(prog="reclaim_orphans.py")
    p.add_argument("--project", required=True)
    p.add_argument("--dry-run", action="store_true")
    p.add_argument("--apply", action="store_true")
    p.add_argument("--confidence-min", choices=["low", "medium", "high"], default="medium")
    p.add_argument("--only", default=None, help="glob filter against mp4 filename")
    args = p.parse_args(argv)

    # Mode guard — orphan reclamation is a microdrama-only operation.
    # Hard error rather than warn: running --apply on a client_deliverable
    # project would move files out of _orphans/, breaking active editorial
    # references. Defense-in-depth for the namespacing system.
    from recoil.core.project import get_project
    project_obj = get_project(args.project)
    if not project_obj.uses_pass_naming:
        print(
            f"ERROR: project '{args.project}' is mode '{project_obj.mode.value}' "
            f"— orphan reclamation does not apply to this project type.",
            file=sys.stderr,
        )
        return 1

    if args.apply and args.dry_run:
        p.error("cannot pass both --apply and --dry-run")
    apply = args.apply and not args.dry_run

    project_root = ProjectPaths.for_project(args.project).renders_dir
    if not project_root.is_dir():
        print(f"[reclaim] no video root: {project_root}")
        return 2

    # Gather orphans: one _orphans/ dir per episode dir, non-recursive.
    orphans: list[Path] = []
    for ep_dir in project_root.iterdir():
        if not ep_dir.is_dir():
            continue
        orph_dir = ep_dir / "_orphans"
        if not orph_dir.is_dir():
            continue
        orphans.extend(
            mp4
            for mp4 in orph_dir.glob("*.mp4")
            if not args.only or fnmatch.fnmatch(mp4.name, args.only)
        )

    print(f"[reclaim] found {len(orphans)} orphan(s); mode={'APPLY' if apply else 'DRY-RUN'}")
    min_rank = CONFIDENCE_RANK[args.confidence_min]

    log_path = project_root / "_reclaim_log.jsonl"
    touched = 0
    skipped = 0
    for video in sorted(orphans):
        # Classify once; reuse the bundle on the apply pass so we don't
        # re-run ffprobe + sidecar parse twice per video.
        classification = _classify_one(video)
        record = _record_from_classification(classification)
        rank = CONFIDENCE_RANK.get(record["confidence"], 0)
        if rank < min_rank:
            print(f"  SKIP (confidence={record['confidence']} < {args.confidence_min}): {video.name}")
            skipped += 1
            continue
        print(f"  {record['confidence'].upper():6s}  {video.name} -> shot_id={record['shot_id']} take={record['take_number']}")
        if not apply:
            continue
        applied = reclaim_one(video, apply=True, log_path=log_path,
                              classification=classification)
        if applied.get("applied"):
            touched += 1
        else:
            # reclaim_one declined to touch this orphan. Surface the reason
            # and count it, so the operator can see which files were left
            # in _orphans/ and the summary totals add up.
            reason = applied.get("skipped", "unknown")
            print(f"    SKIP ({reason}): {video.name}")
            skipped += 1

    print(f"[reclaim] done. classified={len(orphans)} skipped={skipped} applied={touched}")
    return 0


if __name__ == "__main__":
    # Use main()'s integer return value as the process exit status.
    raise SystemExit(main())
