# ==============================================================================
# reconcile_assets.py — Asset reconciliation: staleness, orphans, lineage
# DATE: 2026-05-30
# PHASE: v3 Layout + Ref Taxonomy Migration, Phase 2
# ==============================================================================
"""
reconcile_assets.py — Reconcile look-folder assets against content-hash lineage.

Provides:
  - is_stale(ref_path, hashcache)     — True if a derivative's source has changed
  - bless_ref(ref_path)               — Stamp current source hash into sidecar
  - reconcile_subject(look_dir, ...)  — Full reconcile pass on one look folder
  - reconcile_all(assets_dir)         — Reconcile every look folder under assets/

Reconciliation maintains:
  - {stem}.STALE / {stem}.UNVERIFIED markers alongside images
  - _LINEAGE.txt — human-readable derivation chain per look folder
  - _USED_IN.txt — episodes using this look (from episode_look_map.json)
  - _hashcache.json — size-gated SHA-256 cache to avoid re-hashing unchanged files
  - .jpg → .jpeg extension normalization (rename, no re-encode)
"""

import hashlib
import json
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

from recoil.core.paths import ProjectPaths

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# File suffixes treated as ref images. The tuple order is also the order in
# which _probe_source() tries extensions inside one directory, so ".jpeg"
# is tried before ".jpg".
_IMAGE_EXTENSIONS = (".jpeg", ".png", ".webp", ".jpg")
_SIDECAR_SUFFIX = ".json"  # appended to full filename: foo.jpeg → foo.jpeg.json


# ── Dataclasses ─────────────────────────────────────────────────────

@dataclass
class ReconcileReport:
    """Result of a single look-folder reconciliation pass.

    Every field is a counter that starts at zero and is incremented by
    reconcile_subject() while it processes one look folder.
    """
    # Sidecar JSON files removed because their image no longer exists
    # (look-dir root and pool subdirectories combined).
    orphans_deleted: int = 0
    # Stub sidecars written for hero-shelf images that had none.
    sidecars_created: int = 0
    # Hero-shelf refs for which is_stale() returned True.
    stale_refs: int = 0
    # Hero-shelf refs for which is_stale() returned False.
    fresh_refs: int = 0
    # .STALE / .UNVERIFIED marker files written during the pass.
    markers_written: int = 0


# ── Hash helpers ────────────────────────────────────────────────────

def _sha256(path: Path) -> str:
    """Return the hex-encoded SHA-256 digest of the file at *path*.

    The file is consumed in 64 KiB pieces so that large images are never
    held in memory as a whole.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            piece = stream.read(65536)
            if not piece:
                break
            digest.update(piece)
    return digest.hexdigest()


def _cached_sha256(path: Path, hashcache: dict) -> str:
    """Return the SHA-256 of *path*, consulting and updating *hashcache*.

    *hashcache* maps ``str(path)`` to an entry holding ``size``,
    ``mtime_ns`` and ``sha256``.  A cached digest is reused only when both
    the file size and the modification time still match; otherwise the
    file is re-hashed and the entry is replaced in place.

    Gating on size alone would return an outdated digest whenever a source
    is rewritten with the same byte length, which makes staleness checks
    miss real changes.  Entries written before ``mtime_ns`` was recorded
    (or entries that are otherwise malformed) never match and are simply
    re-hashed once.

    Raises OSError if *path* cannot be stat'ed or read.
    """
    key = str(path)
    stat_result = path.stat()
    size = stat_result.st_size
    mtime_ns = stat_result.st_mtime_ns

    entry = hashcache.get(key)
    if (
        isinstance(entry, dict)
        and entry.get("size") == size
        and entry.get("mtime_ns") == mtime_ns
        and isinstance(entry.get("sha256"), str)
    ):
        return entry["sha256"]

    digest = _sha256(path)
    hashcache[key] = {"size": size, "mtime_ns": mtime_ns, "sha256": digest}
    return digest


def _load_hashcache(hashcache_path: Optional[Path]) -> dict:
    """Load _hashcache.json, returning an empty dict if missing or corrupt.

    "Corrupt" covers every way the file can fail to yield a usable cache:
    it cannot be read, it is not valid UTF-8, it is not valid JSON, or its
    top-level value is not a JSON object.  In all of those cases a warning
    is logged and an empty cache is returned so the caller rebuilds it.
    A ``None`` path or a path that is not a regular file yields an empty
    dict without any warning.
    """
    if not (hashcache_path and hashcache_path.is_file()):
        return {}

    try:
        loaded = json.loads(hashcache_path.read_text(encoding="utf-8"))
    except (ValueError, OSError):
        # ValueError is the common base of json.JSONDecodeError and
        # UnicodeDecodeError, so undecodable bytes are handled as well.
        logger.warning("Corrupt hashcache at %s — rebuilding", hashcache_path)
        return {}

    if not isinstance(loaded, dict):
        # Valid JSON but not an object (e.g. a list or null): callers use
        # dict methods on the cache, so treat this as corrupt too.
        logger.warning("Corrupt hashcache at %s — rebuilding", hashcache_path)
        return {}
    return loaded


def _save_hashcache(hashcache: dict, hashcache_path: Path) -> None:
    """Write *hashcache* to *hashcache_path* as indented, key-sorted JSON.

    The file is UTF-8 encoded and ends with a single trailing newline.
    """
    payload = json.dumps(hashcache, indent=2, sort_keys=True)
    hashcache_path.write_text(f"{payload}\n", encoding="utf-8")


# ── Source probing ──────────────────────────────────────────────────

def _probe_source(parent: Path, source_stem: Optional[str]) -> Optional[Path]:
    """Find the source image named *source_stem* for a ref located in *parent*.

    *parent* is the directory holding the ref.  It is first mapped back to
    the look directory by stripping, in this order and at most once each,
    a trailing ``4k`` component, a pool kind component (``identity``,
    ``turn``, ``closeup``, ``fullbody``, ``expr``) and a ``pool``
    component.

    The search then proceeds as follows and returns the first regular file
    found:
      1. ``{look_dir}/{source_stem}{ext}`` (hero shelf)
      2. ``{look_dir}/pool/{kind}/{source_stem}{ext}`` for every
         subdirectory of ``pool``, visited in sorted order

    Extensions are tried in ``_IMAGE_EXTENSIONS`` order.  Pool
    subdirectories are sorted because directory listing order is
    arbitrary; without sorting, a stem present in several kind directories
    could resolve to a different file from one run to the next.

    Returns None when *source_stem* is empty/None or nothing matches.
    """
    if not source_stem:
        return None

    # Map the ref's directory back to the look directory.
    look_dir = parent
    if look_dir.name == "4k":
        look_dir = look_dir.parent  # kind dir
    if look_dir.name in ("identity", "turn", "closeup", "fullbody", "expr"):
        look_dir = look_dir.parent  # pool/
    if look_dir.name == "pool":
        look_dir = look_dir.parent  # look dir

    # 1. Hero shelf (look-dir root).
    for ext in _IMAGE_EXTENSIONS:
        candidate = look_dir / f"{source_stem}{ext}"
        # is_file() rather than exists(): a directory that happens to carry
        # an image-like name cannot be hashed and is not a source.
        if candidate.is_file():
            return candidate

    # 2. Pool subdirectories, in a stable order.
    pool_dir = look_dir / "pool"
    if pool_dir.is_dir():
        for kind_dir in sorted(pool_dir.iterdir()):
            if not kind_dir.is_dir():
                continue
            for ext in _IMAGE_EXTENSIONS:
                candidate = kind_dir / f"{source_stem}{ext}"
                if candidate.is_file():
                    return candidate

    return None


# ── Sidecar helpers ─────────────────────────────────────────────────

def _sidecar_path(ref_path: Path) -> Path:
    """Map a ref image path to its sidecar path (foo.jpeg → foo.jpeg.json)."""
    combined_suffix = f"{ref_path.suffix}{_SIDECAR_SUFFIX}"
    return ref_path.with_suffix(combined_suffix)


def _read_sidecar(ref_path: Path) -> Optional[dict]:
    """Read the sidecar JSON for *ref_path*, or None if missing or corrupt.

    Returns None without logging when no sidecar file exists.  Returns
    None and logs a warning when the sidecar cannot be read, is not valid
    UTF-8, is not valid JSON, or does not contain a JSON object at the top
    level.  Callers use dict methods on the result, so anything other than
    a dict is treated as corrupt.
    """
    sp = _sidecar_path(ref_path)
    if not sp.is_file():
        return None

    try:
        data = json.loads(sp.read_text(encoding="utf-8"))
    except (ValueError, OSError):
        # ValueError is the common base of json.JSONDecodeError and
        # UnicodeDecodeError.
        logger.warning("Corrupt sidecar: %s", sp)
        return None

    if not isinstance(data, dict):
        logger.warning("Corrupt sidecar: %s", sp)
        return None
    return data


def _write_sidecar(ref_path: Path, data: dict) -> None:
    """Serialize *data* into the sidecar file that belongs to *ref_path*.

    Output is indented, key-sorted JSON in UTF-8 with a trailing newline.
    """
    target = _sidecar_path(ref_path)
    serialized = json.dumps(data, indent=2, sort_keys=True)
    target.write_text(f"{serialized}\n", encoding="utf-8")


# ── Public: staleness check ────────────────────────────────────────

def is_stale(ref_path: Path, hashcache: dict) -> bool:
    """Check whether a ref is stale relative to its recorded source.

    *hashcache* is the in-memory hash cache; it may be updated in place
    when the source file has to be hashed.

    Returns True if:
      - No sidecar exists (or it is unreadable)
      - derived_from is missing, null, or not an object
      - derived_from.source_sha256 is missing or __UNVERIFIED__
      - The source file cannot be found
      - The source file's current hash differs from the recorded hash
    Returns False only when the source is found and its hash matches.
    """
    sidecar = _read_sidecar(ref_path)
    if sidecar is None:
        return True

    derived = sidecar.get("derived_from")
    if not isinstance(derived, dict):
        # Covers a missing key as well as an explicit JSON null, for which
        # dict.get()'s default would not apply.
        return True

    recorded_sha = derived.get("source_sha256")
    if recorded_sha in (None, "__UNVERIFIED__"):
        return True  # migration sentinel

    source_path = _probe_source(ref_path.parent, derived.get("source_stem"))
    if source_path is None:
        return True

    return _cached_sha256(source_path, hashcache) != recorded_sha


# ── Public: bless_ref ──────────────────────────────────────────────

def bless_ref(ref_path: Path) -> None:
    """Stamp a ref's sidecar with the current source hash, clearing UNVERIFIED.

    Reads the sidecar's derived_from.source_stem, finds the source file,
    computes its SHA-256 (uncached), and writes it into
    derived_from.source_sha256.  Afterwards any .STALE or .UNVERIFIED
    marker for the ref is removed, since a blessed ref is fresh by
    definition.

    Raises:
      FileNotFoundError: the ref has no readable sidecar, or the source
        named by source_stem cannot be found.
      ValueError: the sidecar has no derived_from.source_stem (this
        includes a derived_from that is missing, null, or not an object).
    """
    sidecar = _read_sidecar(ref_path)
    if sidecar is None:
        raise FileNotFoundError(f"No sidecar for {ref_path}")

    derived = sidecar.get("derived_from")
    if not isinstance(derived, dict):
        # An explicit JSON null must end in the ValueError below rather
        # than an AttributeError on None.
        derived = {}

    source_stem = derived.get("source_stem")
    if not source_stem:
        raise ValueError(
            f"Cannot bless {ref_path}: sidecar has no derived_from.source_stem"
        )

    source_path = _probe_source(ref_path.parent, source_stem)
    if not source_path:
        raise FileNotFoundError(
            f"Cannot bless {ref_path}: source '{source_stem}' not found"
        )

    derived["source_sha256"] = _sha256(source_path)
    sidecar["derived_from"] = derived
    _write_sidecar(ref_path, sidecar)

    # Blessing implies fresh: drop both the UNVERIFIED and the STALE marker.
    _clear_markers(ref_path)

    logger.info("Blessed %s with source hash from %s", ref_path.name, source_path.name)


# ── Extension normalization ─────────────────────────────────────────

def _normalize_jpg_extension(directory: Path) -> int:
    """Rename every ``*.jpg`` directly inside *directory* to ``*.jpeg``.

    Files are renamed only, never re-encoded.  A matching ``*.jpg.json``
    sidecar is renamed along with its image.  When the ``.jpeg`` target
    already exists the file is left alone and a warning is logged.

    Returns the number of images renamed.
    """
    renamed = 0
    # Snapshot the matches up front; the loop renames entries in the very
    # directory being listed.
    matches = list(directory.glob("*.jpg"))
    for source in matches:
        target = source.with_suffix(".jpeg")
        if target.exists():
            logger.warning(
                "Cannot rename %s → %s: target already exists",
                source.name, target.name,
            )
            continue

        source.rename(target)

        sidecar_before = source.with_suffix(".jpg.json")
        if sidecar_before.exists():
            sidecar_after = target.with_suffix(".jpeg.json")
            sidecar_before.rename(sidecar_after)

        renamed += 1
        logger.info("Normalized extension: %s → %s", source.name, target.name)
    return renamed


# ── Marker file management ─────────────────────────────────────────

def _write_stale_marker(ref_path: Path, reason: str) -> Path:
    """Write ``{stem}.STALE`` next to *ref_path* containing *reason*.

    A newline is appended to *reason*.  Returns the marker's path.
    """
    marker_name = f"{ref_path.stem}.STALE"
    marker_path = ref_path.parent.joinpath(marker_name)
    marker_path.write_text(f"{reason}\n", encoding="utf-8")
    return marker_path


def _write_unverified_marker(ref_path: Path) -> Path:
    """Write ``{stem}.UNVERIFIED`` next to *ref_path* and return its path.

    The marker text names the ref file and points at bless_ref() as the
    way to resolve it.
    """
    marker_path = ref_path.parent.joinpath(f"{ref_path.stem}.UNVERIFIED")
    message = (
        f"UNVERIFIED: {ref_path.name} has no verified source hash.\n"
        f"Run bless_ref() after confirming derivation lineage.\n"
    )
    marker_path.write_text(message, encoding="utf-8")
    return marker_path


def _clear_markers(ref_path: Path) -> int:
    """Delete the ``.STALE`` and ``.UNVERIFIED`` markers of *ref_path*.

    Markers that do not exist are skipped.  Returns how many were removed.
    """
    removed = 0
    folder = ref_path.parent
    stem = ref_path.stem
    for marker_name in (f"{stem}.STALE", f"{stem}.UNVERIFIED"):
        target = folder / marker_name
        if not target.exists():
            continue
        target.unlink()
        removed += 1
    return removed


# ── _LINEAGE.txt generation ────────────────────────────────────────

def _generate_lineage(look_dir: Path) -> None:
    """Generate _LINEAGE.txt from the sidecars of the hero-shelf images.

    Produces a human-readable derivation chain showing which refs derive
    from which sources.  Only image files directly inside *look_dir* are
    listed, sorted by path.  Each image yields one line:

      - ``{name}: no sidecar`` when no readable sidecar exists
      - ``{name} ({kind}) ← {source_stem} ({source_kind}) [source_sha: …]``
        when the sidecar records a source stem
      - ``{name} ({kind}) — primary source [sha: …]`` otherwise

    Hashes are shortened to 12 characters.  Sidecar values that are
    explicitly null (``derived_from``, ``source_sha256``,
    ``content_sha256``) are tolerated and rendered as ``none`` /
    ``unknown`` instead of raising.
    """
    lines: List[str] = [
        f"# Lineage for {look_dir.name}",
        "# Auto-generated by reconcile_assets.py — do not edit",
        "",
    ]

    # Collect all image files at look-dir root (hero shelf)
    hero_refs = sorted(
        f for f in look_dir.iterdir()
        if f.is_file() and f.suffix.lower() in _IMAGE_EXTENSIONS
    )

    for ref in hero_refs:
        sidecar = _read_sidecar(ref)
        if sidecar is None:
            lines.append(f"{ref.name}: no sidecar")
            continue

        df = sidecar.get("derived_from")
        if not isinstance(df, dict):
            # Missing or explicit null: treat as "no derivation recorded".
            df = {}
        source_stem = df.get("source_stem")
        source_kind = df.get("source_kind")
        source_sha = df.get("source_sha256", "unknown")
        content_sha = sidecar.get("content_sha256", "unknown")
        kind = sidecar.get("kind", "unknown")

        if source_stem:
            if source_sha and source_sha != "__UNVERIFIED__":
                sha_display = str(source_sha)[:12]
            else:
                # Keep the sentinel readable; null/empty becomes "none".
                sha_display = source_sha or "none"
            lines.append(
                f"{ref.name} ({kind}) ← {source_stem} ({source_kind}) "
                f"[source_sha: {sha_display}]"
            )
        else:
            if content_sha and content_sha != "unknown":
                content_display = str(content_sha)[:12]
            else:
                # A null content hash cannot be sliced; show it as unknown.
                content_display = "unknown"
            lines.append(
                f"{ref.name} ({kind}) — primary source [sha: {content_display}]"
            )

    lines.append("")
    lineage_path = look_dir / "_LINEAGE.txt"
    lineage_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
    logger.debug("Wrote _LINEAGE.txt in %s", look_dir)


# ── _USED_IN.txt generation ────────────────────────────────────────

def _generate_used_in(look_dir: Path, assets_dir: Path) -> None:
    """Generate _USED_IN.txt from episode_look_map.json.

    The look map lives at _pipeline/state/visual/episode_look_map.json
    (resolved via ProjectPaths.episode_look_map) and maps episode
    identifiers to the looks they reference.  Two per-episode shapes are
    understood:

      - ``{"ep_001": {"jade": "base", ...}}`` — subject name → look name
      - ``{"ep_001": ["char/jade/base", ...]}`` — list of keys, matched
        against "class/subject/look", "subject/look" and "subject"

    Episodes with any other value type are ignored.

    Behaviour:
      - No map file: return without touching anything.
      - Map unreadable, not UTF-8, not JSON, or not a JSON object at the
        top level: log a warning and return without touching anything.
      - No episode uses this look: delete an existing _USED_IN.txt.
      - Otherwise: write the sorted episode list to _USED_IN.txt.
    """
    map_path = ProjectPaths.from_root(assets_dir.parent).episode_look_map
    if not map_path.is_file():
        return  # No map yet — skip silently

    try:
        look_map = json.loads(map_path.read_text(encoding="utf-8"))
    except (ValueError, OSError):
        # ValueError is the common base of json.JSONDecodeError and
        # UnicodeDecodeError.
        logger.warning("Corrupt episode_look_map.json at %s", map_path)
        return
    if not isinstance(look_map, dict):
        # Valid JSON of the wrong shape would fail on .items() below.
        logger.warning("Corrupt episode_look_map.json at %s", map_path)
        return

    # Determine this look's identity: class/subject/look from directory path
    # Expected structure: assets/{class}/{subject}/{look}/
    look_name = look_dir.name
    subject_name = look_dir.parent.name
    class_name = look_dir.parent.parent.name

    # Key formats accepted inside a list-shaped episode entry.
    lookup_keys = (
        f"{class_name}/{subject_name}/{look_name}",
        f"{subject_name}/{look_name}",
        subject_name,
    )

    episodes_using: List[str] = []
    for ep_id, ep_data in look_map.items():
        if isinstance(ep_data, dict):
            # { subject: look_name } format
            used = ep_data.get(subject_name) == look_name
        elif isinstance(ep_data, list):
            # List of "class/subject/look" style strings
            used = any(key in ep_data for key in lookup_keys)
        else:
            used = False
        if used:
            episodes_using.append(ep_id)

    used_in_path = look_dir / "_USED_IN.txt"

    if not episodes_using:
        # Remove stale _USED_IN.txt if this look is no longer referenced
        if used_in_path.exists():
            used_in_path.unlink()
        return

    episodes_using.sort()
    lines = [
        f"# Episodes using {class_name}/{subject_name}/{look_name}",
        "# Auto-generated by reconcile_assets.py — do not edit",
        "",
    ]
    lines.extend(f"- {ep}" for ep in episodes_using)
    lines.append("")

    used_in_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
    logger.debug("Wrote _USED_IN.txt in %s (%d episodes)", look_dir, len(episodes_using))


# ── Orphan detection ────────────────────────────────────────────────

def _detect_and_delete_orphan_sidecars(directory: Path) -> int:
    """Delete sidecar .json files whose parent image no longer exists.

    A sidecar is named ``{image_name}.json`` (e.g.
    ``jade_identity.jpeg.json``).  Only JSON files whose stem ends in one
    of ``_IMAGE_EXTENSIONS`` (case-insensitive) are considered sidecars;
    every other JSON file in *directory* is left untouched.  Without that
    check any unrelated JSON file such as ``notes.json`` would look like
    an orphan, because no file named ``notes`` exists, and would be
    deleted.

    Files starting with ``_`` and ``char.json`` / ``loc.json`` /
    ``prop.json`` are always skipped.

    Returns count of orphans deleted.
    """
    count = 0
    # Snapshot the listing because entries are deleted inside the loop.
    for sidecar in list(directory.glob("*.json")):
        # Skip non-sidecar JSON (e.g., _hashcache.json, char.json)
        if sidecar.name.startswith("_") or sidecar.stem in ("char", "loc", "prop"):
            continue

        # The image name is the sidecar name without the trailing .json
        image_name = sidecar.stem  # e.g. "jade_identity.jpeg"
        if Path(image_name).suffix.lower() not in _IMAGE_EXTENSIONS:
            continue  # not an image sidecar — never delete

        image_path = directory / image_name
        if not image_path.exists():
            sidecar.unlink()
            count += 1
            logger.info("Deleted orphan sidecar: %s", sidecar)
    return count


# ── Public: reconcile_subject ───────────────────────────────────────

def reconcile_subject(
    look_dir: Path,
    hashcache_path: Optional[Path] = None,
) -> ReconcileReport:
    """Run a full reconcile pass on a single look folder.

    *look_dir* is expected to be ``assets/{class}/{subject}/{look}/``.
    When *hashcache_path* is None the cache location is resolved through
    ProjectPaths (the project root being the parent of ``assets``) and its
    directory is created if needed.

    Steps:
      1. Normalize .jpg → .jpeg in pool subdirectories
      2. Load hash cache
      3. Delete orphan sidecars (image deleted but sidecar remains) at the
         look-dir root and in pool subdirectories
      4. For each image at look-dir root (hero shelf):
         a. Check if sidecar exists — create stub if missing
         b. Check staleness via is_stale()
         c. Write/clear .STALE and .UNVERIFIED markers so that at most one
            of the two exists for a ref afterwards
      5. Generate _LINEAGE.txt
      6. Generate _USED_IN.txt (if episode_look_map.json exists)
      7. Save hash cache

    Returns a ReconcileReport with the counters for this pass.  If
    *look_dir* is not a directory a warning is logged and an all-zero
    report is returned.
    """
    report = ReconcileReport()

    if not look_dir.is_dir():
        logger.warning("Look dir does not exist: %s", look_dir)
        return report

    # Determine assets_dir (grandparent of subject, great-grandparent of look)
    # look_dir = assets/{class}/{subject}/{look}/
    assets_dir = look_dir.parent.parent.parent

    # Pool kind directories, listed once and in a stable order.  Steps 1
    # and 3 only rename or delete files, never directories, so the list
    # stays valid for both.
    pool_dir = look_dir / "pool"
    if pool_dir.is_dir():
        kind_dirs = sorted(d for d in pool_dir.iterdir() if d.is_dir())
    else:
        kind_dirs = []

    # 1. Extension normalization in pool dirs
    for kind_dir in kind_dirs:
        _normalize_jpg_extension(kind_dir)

    # 2. Load hash cache
    if hashcache_path is None:
        ppaths = ProjectPaths.from_root(assets_dir.parent)
        ppaths.visual_state_dir.mkdir(parents=True, exist_ok=True)
        hashcache_path = ppaths.hashcache
    hashcache = _load_hashcache(hashcache_path)

    # 3. Delete orphan sidecars at look-dir root and in pool subdirectories
    report.orphans_deleted += _detect_and_delete_orphan_sidecars(look_dir)
    for kind_dir in kind_dirs:
        report.orphans_deleted += _detect_and_delete_orphan_sidecars(kind_dir)

    # 4. Check each hero ref on the shelf
    hero_refs = sorted(
        f for f in look_dir.iterdir()
        if f.is_file() and f.suffix.lower() in _IMAGE_EXTENSIONS
    )

    for ref in hero_refs:
        sidecar = _read_sidecar(ref)

        # Create stub sidecar if missing
        if sidecar is None:
            stub = {
                "subject": look_dir.parent.name,
                "class": look_dir.parent.parent.name,
                "look": look_dir.name,
                "kind": _infer_kind_from_name(ref.stem, look_dir.parent.name),
                "derived_from": {
                    "source_sha256": "__UNVERIFIED__",
                },
            }
            _write_sidecar(ref, stub)
            report.sidecars_created += 1
            sidecar = stub

        if not is_stale(ref, hashcache):
            report.fresh_refs += 1
            cleared = _clear_markers(ref)
            if cleared:
                logger.debug("Cleared %d stale markers for %s", cleared, ref.name)
            continue

        report.stale_refs += 1
        df = sidecar.get("derived_from")
        if not isinstance(df, dict):
            # Missing or explicit null: nothing recorded, hence unverified.
            df = {}
        source_sha = df.get("source_sha256")

        # A ref is either unverified or stale, never both.  Remove whatever
        # an earlier pass left behind so the two markers cannot coexist.
        _clear_markers(ref)

        if source_sha in (None, "__UNVERIFIED__"):
            # Unverified — write UNVERIFIED marker
            _write_unverified_marker(ref)
        else:
            recorded_stem = df.get("source_stem")
            source_stem = recorded_stem or "unknown"
            if _probe_source(ref.parent, recorded_stem) is None:
                # Stale because the source is gone, not because it changed;
                # bless_ref() cannot succeed until it is back.
                reason = (
                    f"STALE: {ref.name} derived from {source_stem}\n"
                    f"       source file could not be found\n"
                    f"       → restore the source, or re-derive from a current source"
                )
            else:
                # Genuinely stale — source changed
                reason = (
                    f"STALE: {ref.name} derived from {source_stem}\n"
                    f"       source hash has changed since derivation\n"
                    f"       → re-derive from current source, or bless_ref() to accept"
                )
            _write_stale_marker(ref, reason)
        report.markers_written += 1

    # 5. Generate _LINEAGE.txt
    _generate_lineage(look_dir)

    # 6. Generate _USED_IN.txt
    _generate_used_in(look_dir, assets_dir)

    # 7. Save hash cache
    _save_hashcache(hashcache, hashcache_path)

    logger.info(
        "Reconciled %s: %d stale, %d fresh, %d orphans deleted, "
        "%d sidecars created, %d markers written",
        look_dir, report.stale_refs, report.fresh_refs,
        report.orphans_deleted, report.sidecars_created,
        report.markers_written,
    )

    return report


# ── Internal helpers ────────────────────────────────────────────────

def _infer_kind_from_name(stem: str, subject: str) -> str:
    """Derive a ref kind from a filename stem.

    Stems are expected to look like ``{subject}_{kind}`` or
    ``{subject}_{kind}_4k``.  The result is ``'identity'`` whenever the
    stem does not start with ``{subject}_`` or the extracted kind is not
    in VALID_REF_TYPES.
    """
    from recoil.core.paths import VALID_REF_TYPES

    prefix = subject + "_"
    if not stem.startswith(prefix):
        return "identity"

    candidate = stem[len(prefix):]
    upscale_tag = "_4k"
    if candidate.endswith(upscale_tag):
        candidate = candidate[:-len(upscale_tag)]

    return candidate if candidate in VALID_REF_TYPES else "identity"


# ── Public: reconcile_all ───────────────────────────────────────────

def reconcile_all(assets_dir: Path) -> List[ReconcileReport]:
    """Reconcile every look folder found below *assets_dir*.

    Visits ``assets/{class}/{subject}/{look}/`` for each class in
    VALID_ASSET_CLASSES, in sorted order at every level.  Subject and look
    directories whose name starts with ``_`` or ``.`` are skipped.  One
    shared hash cache path, resolved through ProjectPaths, is handed to
    every reconcile_subject() call.

    Returns one ReconcileReport per look folder processed; the list is
    empty (and a warning is logged) when *assets_dir* is not a directory.
    """
    from recoil.core.paths import VALID_ASSET_CLASSES

    reports: List[ReconcileReport] = []

    if not assets_dir.is_dir():
        logger.warning("Assets dir does not exist: %s", assets_dir)
        return reports

    ppaths = ProjectPaths.from_root(assets_dir.parent)
    ppaths.visual_state_dir.mkdir(parents=True, exist_ok=True)
    hashcache_path = ppaths.hashcache

    def _visible_dirs(base: Path):
        # Lazily yield sorted subdirectories, skipping hidden/special ones.
        for entry in sorted(base.iterdir()):
            if entry.is_dir() and not entry.name.startswith(("_", ".")):
                yield entry

    for class_name in sorted(VALID_ASSET_CLASSES):
        class_dir = assets_dir / class_name
        if not class_dir.is_dir():
            continue
        for subject_dir in _visible_dirs(class_dir):
            for look_dir in _visible_dirs(subject_dir):
                reports.append(
                    reconcile_subject(look_dir, hashcache_path=hashcache_path)
                )

    logger.info(
        "Reconciled %d look folders under %s",
        len(reports), assets_dir,
    )

    return reports


# Names exported by a star-import of this module; the underscore-prefixed
# helpers above are not listed.
__all__ = [
    "ReconcileReport",
    "bless_ref",
    "is_stale",
    "reconcile_all",
    "reconcile_subject",
]
