#!/usr/bin/env python3
"""
find_broken_refs.py — Scan project assets for broken ref images.

Checks every image file under projects/{project}/assets/ for:
  - File size < 1024 bytes (broken LFS pointer stubs)
  - Missing image header magic (PNG, JPEG, WEBP, GIF)

Reports broken files with size, content preview, and suggested recovery action.

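Usage:
    python3 -m recoil.pipeline.tools.find_broken_refs --project tartarus
    python3 -m recoil.pipeline.tools.find_broken_refs --all
    python3 -m recoil.pipeline.tools.find_broken_refs --all --json

Exits with status 1 when at least one broken ref is found, 0 otherwise.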
"""
import argparse
import json
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable

from recoil.core.paths import (
    ProjectPaths, projects_root, _integrity_check,
    BrokenRefError, _IMAGE_EXTENSIONS_RESOLVER,
)


# REC-236: the git-lfs pointer header. A ref file whose content begins with this
# is an un-materialized LFS pointer (not pulled) — genuinely invalid dispatch
# input, but with a specific remediation (`git lfs pull`) distinct from a
# truly-broken/stub ref.
_LFS_POINTER_SIGNATURE = b"version https://git-lfs.github.com/spec/v1"
# Remediation text attached to a broken ref whose content starts with the
# LFS pointer signature above.
_LFS_REMEDIATION = "run `git lfs pull`"
# Remediation text for every other failure: a missing ref, a ref that is not a
# regular file, or a ref that fails the integrity check without the LFS
# signature.
_BROKEN_REMEDIATION = "broken/stub ref — restore from base/pool/identity"


@dataclass(frozen=True)
class BrokenRef:
    """Immutable record of a single ref path that failed validation (REC-236).

    Every BrokenRef blocks, whatever the cause; only the suggested fix
    varies. An un-materialized git-lfs pointer carries the `git lfs pull`
    remediation, while any other failure carries the restore-the-ref one.
    """

    # The ref path that failed.
    path: Path
    # Size of the offending file in bytes.
    size: int
    # Short, human-readable explanation of the failure.
    reason: str
    # Operator-facing instruction for fixing this particular ref.
    remediation: str


def _remediation_for(path: Path) -> str:
    """Pick the remediation message for a ref that failed the integrity check.

    Only the first ``len(_LFS_POINTER_SIGNATURE)`` bytes are read, so a large
    (but still invalid) file is never loaded into memory just to inspect its
    header.

    Args:
        path: File whose leading bytes are inspected.

    Returns:
        ``_LFS_REMEDIATION`` when the file starts with the git-lfs pointer
        signature, otherwise ``_BROKEN_REMEDIATION``. An unreadable file is
        treated as not being an LFS pointer.
    """
    try:
        with path.open("rb") as fh:
            head = fh.read(len(_LFS_POINTER_SIGNATURE))
    except OSError:
        # Best effort: if the header cannot be read, fall back to the generic
        # remediation rather than failing the whole check.
        head = b""
    if head.startswith(_LFS_POINTER_SIGNATURE):
        return _LFS_REMEDIATION
    return _BROKEN_REMEDIATION


def check_paths(paths: Iterable[Path]) -> list[BrokenRef]:
    """Validate each ref path. Returns a BrokenRef per failure (REC-236 decision 4).

    Per-ref assertion: resolve symlinks to the FINAL target → exists() → is a
    regular file → `_integrity_check`. On any failure, emit a BrokenRef whose
    remediation branches on the LFS-pointer signature. `scan_project` calls
    this function rather than duplicating the checks.

    A path that cannot even be probed (symlink loop, permission error while
    resolving or stat-ing) is reported as a BrokenRef instead of aborting the
    whole pass, so one bad entry never hides the results for the others.

    Args:
        paths: Ref paths to validate; each item is coerced with ``Path()``.

    Returns:
        One BrokenRef per failing path, in input order. Empty when every
        path passes.
    """
    broken: list[BrokenRef] = []
    for raw in paths:
        p = Path(raw)
        try:
            # Resolve symlinks to the FINAL target. strict=False so a dangling
            # symlink yields a non-existent target we then catch below.
            target = p.resolve(strict=False)
            exists = target.exists()
            is_file = exists and target.is_file()
        except (OSError, RuntimeError) as exc:
            # RuntimeError: symlink loop on Python < 3.13. OSError: e.g. a
            # permission error while probing. Either way the ref is unusable.
            broken.append(BrokenRef(
                path=p, size=0,
                reason=f"ref could not be resolved: {exc}",
                remediation=_BROKEN_REMEDIATION,
            ))
            continue
        if not exists:
            broken.append(BrokenRef(
                path=p, size=0,
                reason=f"ref does not exist (resolved to {target})",
                remediation=_BROKEN_REMEDIATION,
            ))
            continue
        if not is_file:
            broken.append(BrokenRef(
                path=p, size=0,
                reason=f"ref is not a regular file: {target}",
                remediation=_BROKEN_REMEDIATION,
            ))
            continue
        try:
            _integrity_check(target)
        except BrokenRefError as e:
            try:
                size = target.stat().st_size
            except OSError:
                # The file may have vanished since the check; report size 0.
                size = 0
            broken.append(BrokenRef(
                path=p, size=size,
                # Keep only the first line of a possibly multi-line error.
                reason=str(e).split("\n")[0],
                remediation=_remediation_for(target),
            ))
    return broken


def scan_project(project: str) -> list[dict]:
    """Scan all asset images in a project. Returns list of broken-file dicts.

    Every file under the project's assets directory with a ``.png``,
    ``.jpeg``, ``.jpg`` or ``.webp`` suffix is passed through `check_paths`.
    Each failure becomes one dict with the keys ``path``, ``relative``
    (relative to the project root), ``size``, ``preview`` (at most 120
    characters decoded from the first 200 bytes), ``error``,
    ``remediation`` and ``recovery_sibling`` (a same-stem file with another
    image extension that is at least 1024 bytes, or ``None``).

    Args:
        project: Project slug handed to ``ProjectPaths.for_project``.

    Returns:
        List of broken-file dicts; empty when the assets directory is
        missing or nothing is broken.
    """
    paths = ProjectPaths.for_project(project)
    broken: list[dict] = []
    assets_dir = paths.assets_dir
    if not assets_dir.is_dir():
        return broken
    candidates = [
        img for img in sorted(assets_dir.rglob("*"))
        if img.is_file() and img.suffix.lower() in (".png", ".jpeg", ".jpg", ".webp")
    ]
    for ref in check_paths(candidates):
        img = ref.path
        try:
            # Read only the bytes needed for the preview instead of the
            # whole (possibly large) file.
            with img.open("rb") as fh:
                preview = fh.read(200).decode("utf-8", errors="replace")
        except OSError:
            preview = ""
        # Look for a same-stem sibling with a different image extension
        # (recovery path 1).
        suffix = img.suffix.lower()
        sibling = None
        for ext in _IMAGE_EXTENSIONS_RESOLVER:
            if ext == suffix:
                continue
            candidate = img.with_suffix(ext)
            try:
                usable = candidate.is_file() and candidate.stat().st_size >= 1024
            except OSError:
                # An unreadable sibling is not a recovery option.
                usable = False
            if usable:
                sibling = str(candidate)
                break
        broken.append({
            "path": str(img),
            "relative": str(img.relative_to(paths.project_root)),
            "size": ref.size,
            "preview": preview[:120],
            "error": ref.reason,
            # Surface the LFS-aware fix computed by check_paths.
            "remediation": ref.remediation,
            "recovery_sibling": sibling,
        })
    return broken


def main() -> None:
    """CLI entry point: scan one project or all projects and report broken refs.

    Exactly one of ``--project SLUG`` or ``--all`` is required. With
    ``--json`` the per-project results are printed as a JSON object;
    otherwise a human-readable report is printed that includes, for each
    broken file, its size, failure reason, remediation (when available),
    recovery sibling and content preview.

    Calls ``sys.exit(1)`` when any broken ref was found and ``sys.exit(0)``
    otherwise, except on the clean text-mode path, which simply returns.
    """
    parser = argparse.ArgumentParser(description="Find broken ref images")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--project", help="Project slug to scan")
    group.add_argument("--all", action="store_true", help="Scan all projects")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args()

    if args.all:
        root = projects_root()
        # Skip hidden and underscore-prefixed directories.
        projects = [d.name for d in sorted(root.iterdir())
                    if d.is_dir() and not d.name.startswith((".", "_"))]
    else:
        projects = [args.project]

    all_broken: dict[str, list[dict]] = {}
    total = 0
    for proj in projects:
        broken = scan_project(proj)
        if broken:
            all_broken[proj] = broken
            total += len(broken)

    if args.json:
        print(json.dumps(all_broken, indent=2))
    else:
        if not all_broken:
            print(f"No broken refs found across {len(projects)} project(s).")
            return
        print(f"{total} broken ref(s) found across {len(all_broken)} project(s):\n")
        for proj, broken_list in all_broken.items():
            print(f"  {proj}/ ({len(broken_list)} broken):")
            for b in broken_list:
                print(f"    {b['relative']}  ({b['size']} bytes)")
                # .get() keeps this tolerant of result dicts lacking a key.
                if b.get("error"):
                    print(f"      -> Reason: {b['error']}")
                if b.get("remediation"):
                    print(f"      -> Fix: {b['remediation']}")
                if b["recovery_sibling"]:
                    print(f"      -> SIBLING EXISTS: {b['recovery_sibling']}")
                if b["preview"]:
                    print(f"      -> Preview: {b['preview']!r}")

    sys.exit(1 if total > 0 else 0)


if __name__ == "__main__":
    main()
