#!/usr/bin/env python3
"""
analyze_reference_scene.py — Gemini Video Analysis for Visual Grammar Bible Research

Analyzes reference scenes (microdramas, anime openings, film scenes) via Gemini
vision to extract shot-by-shot cinematic grammar patterns. Outputs structured
JSON conforming to the Visual Grammar Bible shot_log_schema.

Modes:
    # Single YouTube URL
    python3 analyze_reference_scene.py \\
        --url "https://youtube.com/watch?v=..." \\
        --id "aot_s1e5_transformation" \\
        --title "Eren's First Transformation" \\
        --medium anime_series --genre "action,horror" \\
        --work "Attack on Titan"

    # Local video file
    python3 analyze_reference_scene.py \\
        --file ~/Desktop/reelshort_ep1.mp4 \\
        --id "reelshort_billionaire_ep1" \\
        --title "Double Life Ep 1" \\
        --medium microdrama --genre "romance,revenge"

    # Batch from TSV
    python3 analyze_reference_scene.py \\
        --batch _research/visual_grammar_bible/reference_list.tsv

    # Validate existing corpus file
    python3 analyze_reference_scene.py \\
        --validate _research/visual_grammar_bible/corpus/scene_id.json

    # Aggregate stats across corpus
    python3 analyze_reference_scene.py --summary

    # Calibrate timing accuracy
    python3 analyze_reference_scene.py \\
        --calibrate corpus/scene_id.json ground_truth.tsv

Env vars:
    GOOGLE_API_KEY — Gemini API key (required, GEMINI_API_KEY as fallback)

Dependencies:
    pip install google-genai
"""

import argparse
import csv
import json
import os
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

from recoil.core.model_profiles import get_model

# ── Constants ────────────────────────────────────────────────────────────

# Model identifier passed as `model=` to generate_content in call_gemini_video.
# NOTE(review): resolved by the project helper get_model("flash", "text"); the
# concrete model name is not visible from this file — confirm it accepts video.
MODEL = get_model("flash", "text")
MAX_OUTPUT_TOKENS = 65536  # max_output_tokens for each generate_content call
TEMPERATURE = 0.2  # sampling temperature for each generate_content call
# Base delay in seconds. call_gemini_video scales it (delay * attempt * 2) for
# its retry back-off. 5 s spacing is at most 12 calls/min, under a 15 RPM quota.
RATE_LIMIT_DELAY = 5  # seconds between API calls (15 RPM safe)
MAX_RETRIES = 3  # total generate_content attempts before giving up

# Schema enum values. Most are embedded in the prompt (see the ENUM REFERENCE
# section of build_analysis_prompt) so Gemini knows the valid options; several
# are also checked by validate_shot_log.

# Allowed scene.source.medium values (checked by validate_shot_log).
MEDIUM_ENUM = [
    "anime_series", "anime_film", "anime_opening", "microdrama",
    "live_action_film", "live_action_series", "music_video", "amv",
    "webtoon", "manga",
]
# Allowed scene.narrative_function values (prompt + validation).
NARRATIVE_FUNCTION_ENUM = [
    "protagonist_reveal", "antagonist_reveal", "transformation",
    "betrayal", "confrontation", "chase", "escape",
    "intimate_moment", "loss", "victory", "sacrifice",
    "mystery_reveal", "power_display", "moral_choice",
    "comedic_break", "tension_build", "climax", "aftermath",
    "hook_cold_open", "cliffhanger", "montage", "other",
]
# Allowed shot_type values (prompt + validation).
SHOT_TYPE_ENUM = [
    "ECU", "CU", "MCU", "MS", "MLS", "LS", "WIDE", "EWS",
    "POV", "OTS", "INSERT", "VFX", "TITLE",
]
# Allowed scene.source.format values (checked by validate_shot_log).
FORMAT_ENUM = ["16:9", "9:16", "4:3", "1:1", "2.39:1", "scroll_vertical"]
# Allowed camera.movement values (prompt only).
CAMERA_MOVEMENT_ENUM = [
    "static", "pan_left", "pan_right", "tilt_up", "tilt_down",
    "push_in", "pull_out", "track_left", "track_right", "crane_up",
    "crane_down", "handheld", "shake", "whip_pan", "rack_focus",
    "zoom_in", "zoom_out", "orbit", "dutch_tilt",
]
# Allowed transition_in values (prompt only).
TRANSITION_IN_ENUM = [
    "cut", "dissolve", "fade_in", "fade_from_black", "fade_from_white",
    "wipe", "whip_pan_in", "match_cut", "smash_cut", "j_cut", "l_cut",
    "iris_in", "scene_start",
]
# Allowed transition_out values (prompt only).
TRANSITION_OUT_ENUM = [
    "cut", "dissolve", "fade_out", "fade_to_black", "fade_to_white",
    "wipe", "whip_pan_out", "match_cut", "smash_cut", "j_cut", "l_cut",
    "iris_out", "scene_end",
]
# Allowed per-shot beat values (prompt + validation).
BEAT_ENUM = ["hook", "setup", "escalation", "turn", "cliffhanger", "breath", "transition"]
# Allowed per-shot rhythm_role values (prompt + validation).
RHYTHM_ROLE_ENUM = ["hold", "impact", "breath", "rapid", "transition"]
# Allowed relationship_to_previous.type values (prompt + validation).
RELATIONSHIP_TYPE_ENUM = [
    "continuation", "contrast", "reaction", "parallel",
    "flashback", "flash_forward", "match_cut", "answer",
]
# Allowed camera.angle values (prompt only).
CAMERA_ANGLE_ENUM = [
    "eye_level", "low", "high", "overhead", "dutch", "worms_eye", "birds_eye",
]
# Allowed camera.lens_feel values (prompt only).
LENS_FEEL_ENUM = ["wide", "normal", "long", "macro", "fisheye"]
# Allowed camera.depth_of_field values (prompt only).
DOF_ENUM = ["deep", "medium", "shallow", "rack"]
# Allowed lighting.key values (prompt only).
LIGHTING_KEY_ENUM = ["high", "medium", "low", "silhouette", "practical_only"]
# Allowed lighting.contrast values (prompt only).
LIGHTING_CONTRAST_ENUM = ["flat", "low", "medium", "high", "extreme"]
# Allowed lighting.direction values (prompt only).
LIGHTING_DIRECTION_ENUM = [
    "front", "side", "back", "rim", "below", "above", "ambient", "multiple",
]
# Allowed audio.music_state values (prompt only).
MUSIC_STATE_ENUM = [
    "silence", "tension_drone", "building", "crescendo_peak",
    "aftermath_decay", "steady_rhythm", "stinger", "theme_statement", "dissonance",
]
# Allowed framing.vertical_position values (prompt only).
VERTICAL_POSITION_ENUM = [
    "upper_third", "center", "lower_third", "full_frame",
    "off_center_high", "off_center_low",
]
# Allowed framing.horizontal_position values (prompt only).
HORIZONTAL_POSITION_ENUM = [
    "center", "left_third", "right_third", "left_edge", "right_edge",
]


# ── Path Resolution ──────────────────────────────────────────────────────

def find_project_root() -> Path:
    """Walk up from this file to find the Recoil project root.

    The root is the first directory, starting at this file's own directory
    and checking up to 10 levels, that contains both a ``tools`` and an
    ``editors`` subdirectory.

    Returns:
        The project root directory.

    Exits the process with status 2 (after printing an error to stderr)
    when no such directory is found.
    """
    candidate = Path(__file__).resolve().parent
    for _ in range(10):
        if (candidate / "tools").is_dir() and (candidate / "editors").is_dir():
            return candidate
        candidate = candidate.parent
    # Name the markers we searched for so the failure is actionable.
    print(
        "ERROR: Could not locate project root "
        "(no directory containing both tools/ and editors/ found).",
        file=sys.stderr,
    )
    sys.exit(2)


def corpus_dir() -> Path:
    """Return ``<project root>/_research/visual_grammar_bible/corpus``.

    The directory (and any missing parents) is created on demand.
    """
    target = find_project_root().joinpath(
        "_research", "visual_grammar_bible", "corpus"
    )
    target.mkdir(parents=True, exist_ok=True)
    return target


# ── Gemini Client ────────────────────────────────────────────────────────

def get_api_key() -> str:
    """Return the Gemini API key taken from the environment.

    GOOGLE_API_KEY is consulted first and GEMINI_API_KEY second; an empty
    value counts as unset. When neither yields a key, an error is printed
    to stderr and the process exits with status 1.
    """
    for env_name in ("GOOGLE_API_KEY", "GEMINI_API_KEY"):
        value = os.environ.get(env_name)
        if value:
            return value

    message = (
        "ERROR: GOOGLE_API_KEY (or GEMINI_API_KEY) not set.\n"
        "Run: export GOOGLE_API_KEY=\"your-key-here\""
    )
    print(message, file=sys.stderr)
    sys.exit(1)


def get_client():
    """Build a ``google.genai.Client`` authenticated with the environment's API key."""
    # Imported lazily so the rest of the module loads without google-genai installed.
    from google import genai

    api_key = get_api_key()
    return genai.Client(api_key=api_key)


# ── Prompt Construction ──────────────────────────────────────────────────

def build_analysis_prompt(
    scene_id: str,
    title: str,
    work: str,
    medium: str,
    genre_tags: list,
    format_ratio: str = "16:9",
) -> str:
    """Build the shot-by-shot analysis prompt sent to Gemini with the video.

    The prompt is assembled from fixed sections: role and instructions, the
    pre-filled scene metadata, output-length guidance, the allowed values of
    every enum field, timing and relationship guidance, vertical-grammar
    markers, pattern identification, and a JSON skeleton of the expected
    output. When ``medium == "microdrama"`` a two-peak structure section and
    an ``episode_structure`` object in the JSON skeleton are added.

    Args:
        scene_id: Value shown to the model as ``scene.id``.
        title: Value shown as ``scene.title``.
        work: Value shown as ``scene.source.work``.
        medium: Value shown as ``scene.source.medium``; also selects the
            microdrama-only sections.
        genre_tags: Rendered with ``json.dumps`` as ``scene.genre_tags``.
        format_ratio: Value shown as ``scene.source.format``.

    Returns:
        The complete prompt text.

    NOTE(review): ``scene_id``, ``title``, ``work``, ``medium`` and
    ``format_ratio`` are interpolated verbatim between double quotes; a value
    that itself contains a double quote is not escaped.
    """

    is_microdrama = medium == "microdrama"

    # Only this first segment is an f-string (it interpolates the metadata and
    # the enum lists); the segments appended below are plain string literals,
    # which is why their literal { } braces need no escaping.
    prompt = f"""# ROLE

You are an expert cinematographer and editor analyzing a video scene shot-by-shot.
Your task is to produce a precise, structured JSON shot log that captures every cut,
every camera decision, every editorial relationship, and every rhythmic pattern.

# INSTRUCTIONS

1. Watch the entire scene first for overall flow and emotional arc.
2. Then go through frame by frame, identifying every cut point.
3. For each shot, estimate the duration in milliseconds as accurately as possible.
4. Identify the relationship between consecutive shots — this is the KEY field for
   editorial grammar extraction.
5. Note patterns that span multiple shots (compression/release, scale ladders, etc).

# SCENE METADATA (pre-filled — include in output as-is)

- scene.id: "{scene_id}"
- scene.title: "{title}"
- scene.source.work: "{work}"
- scene.source.medium: "{medium}"
- scene.source.format: "{format_ratio}"
- scene.genre_tags: {json.dumps(genre_tags)}

# OUTPUT LENGTH MANAGEMENT

Keep your output within the token limit. For scenes longer than 120 seconds:
- Keep "notes" fields to 1 sentence max
- Keep "intent" fields to 1 sentence max
- Keep "sound_design_note" fields brief (1 sentence)
- Omit "sfx" arrays if no notable sound effects
- Focus detail on the most cinematically interesting shots

# ENUM REFERENCE

Use ONLY these values for enum fields:

**shot_type:** {json.dumps(SHOT_TYPE_ENUM)}
**camera.movement:** {json.dumps(CAMERA_MOVEMENT_ENUM)}
**camera.angle:** {json.dumps(CAMERA_ANGLE_ENUM)}
**camera.lens_feel:** {json.dumps(LENS_FEEL_ENUM)}
**camera.depth_of_field:** {json.dumps(DOF_ENUM)}
**transition_in:** {json.dumps(TRANSITION_IN_ENUM)}
**transition_out:** {json.dumps(TRANSITION_OUT_ENUM)}
**lighting.key:** {json.dumps(LIGHTING_KEY_ENUM)}
**lighting.contrast:** {json.dumps(LIGHTING_CONTRAST_ENUM)}
**lighting.direction:** {json.dumps(LIGHTING_DIRECTION_ENUM)}
**audio.music_state:** {json.dumps(MUSIC_STATE_ENUM)}
**beat:** {json.dumps(BEAT_ENUM)}
**rhythm_role:** {json.dumps(RHYTHM_ROLE_ENUM)}
**relationship_to_previous.type:** {json.dumps(RELATIONSHIP_TYPE_ENUM)}
**narrative_function:** {json.dumps(NARRATIVE_FUNCTION_ENUM)}
**framing.vertical_position:** {json.dumps(VERTICAL_POSITION_ENUM)}
**framing.horizontal_position:** {json.dumps(HORIZONTAL_POSITION_ENUM)}

# TIMING GUIDANCE

- Estimate each shot's duration in milliseconds.
- The sum of all shot durations should equal approximately the total scene duration (within ±15%).
- Be precise: a quick insert might be 300ms; a held reaction shot might be 3000ms.
- If you are unsure, err toward slightly longer rather than shorter durations.

# RELATIONSHIP ANALYSIS

The `relationship_to_previous` field is THE most important field for grammar extraction.
For every shot after the first:

- **continuation**: Same action continues from previous shot (e.g., tracking shot picks up where last left off).
- **contrast**: Deliberate break in scale, tempo, tone, or subject (the CUT is the statement).
- **reaction**: This shot is a response to what happened in the previous shot.
- **parallel**: Simultaneous action happening elsewhere (cross-cutting).
- **flashback** / **flash_forward**: Temporal shift.
- **match_cut**: Visual or motion match between shots (a hand rising → sun rising).
- **answer**: This shot answers a visual question posed by the previous shot.

Also fill in:
- **dimension**: WHAT changed (scale, tempo, subject, angle, lighting, location, emotion).
- **intent**: WHY this cut works — free-form directorial insight.
"""

    # Section 7: Two-Peak Structure (microdrama only)
    if is_microdrama:
        prompt += """
# TWO-PEAK STRUCTURE DETECTION (microdrama only)

Microdramas use a specific dual-peak emotional architecture:

**Peak 1 — "The Spike" (~45-55s):**
The resolution of the current micro-conflict. The slap, the reveal, the kiss.
Function: Dopamine (satisfaction). Rewards the viewer.

**"The Breath" (~55-58s):**
Brief reaction/processing moment. The room goes silent. Characters absorb the shock.
Function: Calibration. Resets pacing for the final push.

**Peak 2 — "The Button" (~85-90s):**
The cliffhanger that sets up the NEXT episode. A new threat, a new arrival, a discovery.
Function: Cortisol (anxiety). Forces the viewer to pay/continue.

Identify whether this scene uses this two-peak pattern or a single-peak structure.
If two-peak, note the timestamps and shot indices of the Spike, Breath, and Button.
Also identify the hook technique used in the first 3 seconds.
"""

    # Section 8: Vertical Grammar Markers
    # Appended for every medium; the only output field that asks for these
    # markers (episode_structure.vertical_grammar_markers) is microdrama-only.
    prompt += """
# VERTICAL GRAMMAR MARKERS

Flag these specific patterns when you detect them:

- **rule_of_singles**: 80%+ of shots contain only one person.
- **stacked_blocking**: Foreground shoulder/back with background face (depth stacking in narrow frame).
- **proof_shot**: Document, phone, or object held to lens with rack focus.
- **chin_up_power**: Low angle = dominance, high angle = submission.
- **impact_beats**: Violence shown as face→sound→reaction→room (not choreography).
- **signage_establishing**: Close-up of sign/text replaces wide establishing shot.
- **audio_whoosh**: "Whoosh" sound on transitions.
- **audio_sting**: Sharp musical stinger on reveals/shocks.
- **audio_echo**: Echo/reverb on internal monologue.
- **teleportation**: Characters appear where needed with no travel shown.
- **reveal_striptease**: Power/identity revealed in deliberate layers across shots.
- **logic_compression**: Complex events reduced to single gesture/document.
"""

    # Section 9: Pattern identification
    prompt += """
# PATTERN IDENTIFICATION

In the `patterns_observed` array, identify multi-shot patterns such as:
- **compression_release**: Rapid cuts building tension → sustained hold for impact.
- **scale_ladder**: Progressive scaling (CU→MCU→MS→LS or reverse).
- **staccato_assault**: Sequence of very short shots (<500ms each) for urgency.
- **tight_to_wide_reveal**: Tight framing building mystery → wide shot revealing context.
- **parallel_intercut**: Cross-cutting between two simultaneous actions.
- **rhythmic_breathing**: Alternating long/short shots creating pulse.
- **match_cut_chain**: Series of match cuts building visual connection.
- **silence_impact**: Deliberate audio silence before/after a climactic shot.

For each pattern, note:
- shot_range: [start_index, end_index]
- emotional_function: what emotion it creates
- rhythm_signature: compact notation (e.g., "rapid_rapid_rapid_HOLD_breath")
- relationship_sequence: the relationship_to_previous types across the pattern
"""

    # Section 10: Output format
    # The skeleton is left without its final closing brace so the optional
    # microdrama object can be appended before the brace is added below.
    prompt += """
# OUTPUT FORMAT

Return ONLY valid JSON (no markdown fences, no commentary outside the JSON).
The JSON must conform to this structure:

{
  "schema_version": "1.0",
  "scene": {
    "id": "<pre-filled>",
    "title": "<pre-filled>",
    "source": {
      "work": "<pre-filled>",
      "medium": "<pre-filled>",
      "format": "<pre-filled>",
      "season": <int or null>,
      "episode": <int or null>,
      "url": <string or null>
    },
    "duration_seconds": <total scene duration as number>,
    "genre_tags": [<pre-filled>],
    "emotional_arc": "<prose description of the emotional journey>",
    "emotional_arc_curve": [<intensity values 1-10 sampled at 5-second intervals>],
    "narrative_function": "<one of the narrative_function enum values>",
    "why_selected": "<why this scene is worth analyzing>"
  },
  "shots": [
    {
      "index": 1,
      "duration_ms": <int>,
      "shot_type": "<enum>",
      "subject": "<what's in frame>",
      "subject_count": <int>,
      "framing": {
        "vertical_position": "<enum>",
        "horizontal_position": "<enum>",
        "headroom": "tight|normal|loose|none",
        "look_direction": "<enum or null>"
      },
      "camera": {
        "movement": "<enum>",
        "movement_speed": "imperceptible|slow|medium|fast|violent" or null,
        "lens_feel": "<enum>",
        "depth_of_field": "<enum>",
        "angle": "<enum>"
      },
      "transition_in": "<enum>",
      "transition_out": "<enum>",
      "lighting": {
        "key": "<enum>",
        "color_temp": "<free text>",
        "contrast": "<enum>",
        "direction": "<enum>"
      },
      "relationship_to_previous": null for first shot, or {
        "type": "<enum>",
        "dimension": "<what changed>",
        "intent": "<why this cut works>"
      },
      "audio": {
        "music_state": "<enum>",
        "music_transition": "<enum or null>",
        "sfx": ["<tagged sound effects>"],
        "dialogue": "<line or null>",
        "silence": <boolean>,
        "audio_intensity": <0-10>,
        "sound_design_note": "<free-form>"
      },
      "emotion": "<primary emotion>",
      "intensity": <1-10>,
      "beat": "<enum>",
      "rhythm_role": "<enum>",
      "screen_direction": "<direction of energy>",
      "text_overlay": <boolean>,
      "notes": "<why this shot works>"
    }
  ],
  "patterns_observed": [
    {
      "pattern_name": "<snake_case>",
      "description": "<what and why>",
      "shot_range": [<start>, <end>],
      "emotional_function": "<effect>",
      "rhythm_signature": "<compact notation>",
      "relationship_sequence": ["<types>"]
    }
  ],
  "microdrama_applicability": {
    "transferable": <boolean>,
    "vertical_adaptation_notes": "<how framing changes for 9:16>",
    "pacing_notes": "<compression/expansion needed?>",
    "patterns_to_extract": ["<pattern names worth codifying>"],
    "patterns_to_skip": ["<patterns that won't transfer>"]
  }"""

    # Microdrama scenes get one more top-level object in the skeleton.
    if is_microdrama:
        prompt += """,
  "episode_structure": {
    "has_two_peak": <boolean>,
    "spike_timestamp_s": <seconds or null>,
    "spike_shot_index": <int or null>,
    "spike_description": "<what happens at the spike>",
    "breath_timestamp_s": <seconds or null>,
    "breath_shot_index": <int or null>,
    "button_timestamp_s": <seconds or null>,
    "button_shot_index": <int or null>,
    "button_description": "<what happens at the button>",
    "hook_technique": "<how the first 3 seconds grab attention>",
    "vertical_grammar_markers": ["<detected markers from the list above>"]
  }"""

    # Close the top-level JSON object of the skeleton.
    prompt += "\n}\n"

    return prompt


# ── API Call ─────────────────────────────────────────────────────────────

def call_gemini_video(client, prompt: str, video_source: dict) -> str:
    """Send video + prompt to Gemini and return raw response text.

    Args:
        client: google.genai client (e.g. the one returned by get_client()).
        prompt: Analysis prompt; sent after the video part.
        video_source: ``{"type": "url", "url": "..."}`` or
            ``{"type": "file", "path": "..."}``.

    Returns:
        The response text, or ``""`` when the response carries no text.

    Raises:
        ValueError: ``video_source["type"]`` is neither ``"url"`` nor ``"file"``.
        RuntimeError: The uploaded file ended in the FAILED state, or all
            MAX_RETRIES generate_content attempts raised (the last error is
            chained as ``__cause__``).
    """
    from google.genai import types

    source_type = video_source["type"]

    # Build content parts
    parts = []

    if source_type == "url":
        # YouTube URL — pass as URI part
        parts.append(types.Part.from_uri(
            file_uri=video_source["url"],
            mime_type="video/mp4",
        ))
    elif source_type == "file":
        # Local file — upload via Files API
        file_path = video_source["path"]
        mime = _guess_video_mime(file_path)
        print(f"  Uploading {Path(file_path).name} ({mime})...", file=sys.stderr)

        uploaded = client.files.upload(file=file_path)

        # Wait for processing.
        # NOTE(review): this poll has no upper bound; a file stuck in
        # PROCESSING blocks forever.
        while uploaded.state == "PROCESSING":
            time.sleep(2)
            uploaded = client.files.get(name=uploaded.name)

        if uploaded.state == "FAILED":
            raise RuntimeError(f"File upload failed: {uploaded.state}")

        parts.append(types.Part.from_uri(
            file_uri=uploaded.uri,
            mime_type=mime,
        ))
    else:
        # Previously an unknown type fell through and the prompt was sent
        # with no video attached, yielding a hallucinated analysis.
        raise ValueError(
            f"Unsupported video_source type: {source_type!r} (expected 'url' or 'file')"
        )

    parts.append(prompt)

    # Safety settings — scenes may contain action/violence
    safety = [
        types.SafetySetting(category=cat, threshold="BLOCK_NONE")
        for cat in [
            "HARM_CATEGORY_HARASSMENT",
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
        ]
    ]

    # Call with retries; back-off grows linearly with the attempt number.
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            print(f"  Sending to {MODEL} (attempt {attempt}/{MAX_RETRIES})...", file=sys.stderr)
            response = client.models.generate_content(
                model=MODEL,
                contents=parts,
                config=types.GenerateContentConfig(
                    temperature=TEMPERATURE,
                    max_output_tokens=MAX_OUTPUT_TOKENS,
                    safety_settings=safety,
                    thinking_config=types.ThinkingConfig(thinking_budget=0),
                ),
            )
            resp_text = response.text or ""
            resp_len = len(resp_text)
            print(f"  Response: {resp_len} chars", file=sys.stderr)
            if resp_len < 1000:
                # A short reply is still returned; the finish reasons are
                # logged to help diagnose it.
                print("  WARNING: Very short response — possible safety block or thinking overflow",
                      file=sys.stderr)
                for c in getattr(response, "candidates", None) or []:
                    if hasattr(c, "finish_reason"):
                        print(f"  Finish reason: {c.finish_reason}", file=sys.stderr)
            return resp_text
        except Exception as e:
            err_str = str(e)
            if attempt < MAX_RETRIES:
                delay = RATE_LIMIT_DELAY * attempt * 2
                print(f"  Error: {err_str[:200]}", file=sys.stderr)
                print(f"  Retrying in {delay}s...", file=sys.stderr)
                time.sleep(delay)
            else:
                # Chain the cause so the original traceback is preserved.
                raise RuntimeError(
                    f"Gemini API failed after {MAX_RETRIES} attempts: {err_str[:300]}"
                ) from e


def _guess_video_mime(path: str) -> str:
    ext = Path(path).suffix.lower()
    return {
        ".mp4": "video/mp4",
        ".mov": "video/quicktime",
        ".avi": "video/x-msvideo",
        ".webm": "video/webm",
        ".mkv": "video/x-matroska",
        ".m4v": "video/mp4",
    }.get(ext, "video/mp4")


# ── Response Parsing ─────────────────────────────────────────────────────

def parse_gemini_response(text: str) -> Optional[dict]:
    """Extract JSON from Gemini response, handling markdown fences and truncation.

    Tries, in order: a direct parse after stripping markdown code fences,
    the outermost ``{...}`` span found by regex, and truncation repair via
    ``_repair_truncated_json``. If all fail, the raw text is written to
    ``gemini_parse_failure.txt`` in the system temp directory.

    Args:
        text: Raw response text; ``None`` or blank means no response.

    Returns:
        The parsed JSON value, or ``None`` when the response is empty or
        cannot be parsed. Errors are reported on stderr.
    """
    # call_gemini_video turns a missing response into "", so blank text must
    # be treated the same as None.
    if text is None or not text.strip():
        print("ERROR: Gemini returned empty response (likely safety block or video too long)", file=sys.stderr)
        return None
    text = text.strip()

    # Strip markdown code fences
    if text.startswith("```"):
        text = re.sub(r"^```\w*\n?", "", text)
    if text.endswith("```"):
        text = text[:-3]
    text = text.strip()

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Regex fallback — find outermost JSON object
    match = re.search(r"\{[\s\S]*\}", text)
    if match:
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            pass

    # Truncation repair — Gemini may hit token limit mid-JSON.
    # Try progressively closing open structures.
    repaired = _repair_truncated_json(text)
    if repaired:
        print("  (Repaired truncated JSON from Gemini output)", file=sys.stderr)
        return repaired

    # Save raw response for debugging. Use the platform temp dir rather than
    # a hard-coded /tmp, and never let a failed dump mask the parse error.
    import tempfile

    debug_path = Path(tempfile.gettempdir()) / "gemini_parse_failure.txt"
    try:
        debug_path.write_text(text, encoding="utf-8")
        saved_note = f"Saved full response to {debug_path} for debugging"
    except OSError as exc:
        saved_note = f"(Could not save full response to {debug_path}: {exc})"
    print(
        f"ERROR: Could not parse Gemini response as JSON.\n"
        f"Response preview: {text[:500]}...\n"
        f"Response tail: ...{text[-200:]}\n"
        f"{saved_note}",
        file=sys.stderr,
    )
    return None


def _repair_truncated_json(text: str) -> Optional[dict]:
    """Attempt to repair truncated JSON by closing open structures.

    When Gemini hits the output token limit, the JSON is valid up to the
    truncation point. We use multiple strategies, from least to most aggressive.
    """
    # Find the start of the JSON object
    start = text.find("{")
    if start < 0:
        return None

    fragment = text[start:]

    # Strategy 1: Find last complete shot object in the "shots" array.
    # Shot objects end with "}" and the next one starts with "{".
    # Look for the pattern: "},\n    {" or "},\n  {" backwards.
    shot_boundary = None
    for marker in ['},\n      {', '},\n    {', '},\n  {', '},\n{']:
        idx = fragment.rfind(marker)
        if idx > 0:
            shot_boundary = idx + 1  # Include the closing }
            break

    if shot_boundary and shot_boundary > len(fragment) * 0.5:
        # We found a shot boundary in the second half — truncate there
        truncated = fragment[:shot_boundary].rstrip()
        if truncated.endswith(","):
            truncated = truncated[:-1]
        # Close: shots array + patterns array + outer object
        # Count what's open
        result = _close_and_parse(truncated)
        if result:
            n_shots = len(result.get("shots", []))
            print(f"  (Repaired by shot-boundary truncation — {n_shots} shots recovered)",
                  file=sys.stderr)
            return result

    # Strategy 2: Line-based trimming (original approach)
    lines = fragment.split("\n")
    while lines:
        last = lines[-1].rstrip()
        if not last:
            lines.pop()
            continue
        stripped = last.rstrip().rstrip(",")
        if stripped.endswith(("}", "]", '"', "true", "false", "null")) or stripped[-1:].isdigit():
            break
        lines.pop()

    if lines:
        joined = "\n".join(lines).rstrip()
        if joined.endswith(","):
            joined = joined[:-1]
        result = _close_and_parse(joined)
        if result:
            print(f"  (Repaired by line-based truncation)", file=sys.stderr)
            return result

    # Strategy 3: Nesting-aware repair
    return _repair_by_nesting(fragment)


def _close_and_parse(fragment: str) -> Optional[dict]:
    """Close unclosed strings, braces, and brackets then parse."""
    # Track nesting order and string state
    stack = []
    in_string = False
    escape_next = False

    for ch in fragment:
        if escape_next:
            escape_next = False
            continue
        if ch == "\\":
            escape_next = True
            continue
        if ch == '"':
            in_string = not in_string
            continue
        if in_string:
            continue
        if ch in ("{", "["):
            stack.append(ch)
        elif ch == "}" and stack and stack[-1] == "{":
            stack.pop()
        elif ch == "]" and stack and stack[-1] == "[":
            stack.pop()

    # Build suffix: close unclosed string first, then structures
    suffix = ""
    if in_string:
        suffix += '"'  # Close the unclosed string

    if not stack and not in_string:
        try:
            return json.loads(fragment)
        except json.JSONDecodeError:
            return None

    # Remove trailing incomplete content after closing the string
    working = fragment + suffix
    working = working.rstrip()
    if working.endswith(","):
        working = working[:-1]

    # Close structures in reverse nesting order
    for opener in reversed(stack):
        working += "]" if opener == "[" else "}"

    try:
        return json.loads(working)
    except json.JSONDecodeError:
        # Try more aggressive: trim back to last complete key-value pair
        # by finding the last complete line that ends with a value
        lines = working.split("\n")
        for i in range(len(lines) - 1, -1, -1):
            line = lines[i].rstrip().rstrip(",")
            if line.endswith(("}", "]", '"', "true", "false", "null")) or (line and line[-1:].isdigit()):
                trimmed = "\n".join(lines[:i+1])
                trimmed = trimmed.rstrip().rstrip(",")
                # Re-calculate closers from trimmed
                stack2 = []
                in_str2 = False
                esc2 = False
                for ch in trimmed:
                    if esc2: esc2 = False; continue
                    if ch == "\\": esc2 = True; continue
                    if ch == '"': in_str2 = not in_str2; continue
                    if in_str2: continue
                    if ch in ("{", "["): stack2.append(ch)
                    elif ch == "}" and stack2 and stack2[-1] == "{": stack2.pop()
                    elif ch == "]" and stack2 and stack2[-1] == "[": stack2.pop()
                suffix2 = '"' if in_str2 else ""
                for opener in reversed(stack2):
                    suffix2 += "]" if opener == "[" else "}"
                try:
                    return json.loads(trimmed + suffix2)
                except json.JSONDecodeError:
                    continue
        return None


def _repair_by_nesting(text: str) -> Optional[dict]:
    """Nesting-aware truncated JSON repair.

    Tracks the actual open/close order of { and [ to close them correctly.
    """
    # Walk the string tracking nesting, ignoring chars inside strings
    nesting_stack = []
    in_string = False
    escape_next = False

    for ch in text:
        if escape_next:
            escape_next = False
            continue
        if ch == "\\":
            escape_next = True
            continue
        if ch == '"':
            in_string = not in_string
            continue
        if in_string:
            continue
        if ch in ("{", "["):
            nesting_stack.append(ch)
        elif ch == "}":
            if nesting_stack and nesting_stack[-1] == "{":
                nesting_stack.pop()
        elif ch == "]":
            if nesting_stack and nesting_stack[-1] == "[":
                nesting_stack.pop()

    if not nesting_stack:
        # Fully closed already — parse failure is something else
        return None

    # Trim trailing incomplete content
    trimmed = text.rstrip()
    if trimmed.endswith(","):
        trimmed = trimmed[:-1]

    # Close in reverse nesting order
    closers = ""
    for opener in reversed(nesting_stack):
        closers += "]" if opener == "[" else "}"

    try:
        return json.loads(trimmed + closers)
    except json.JSONDecodeError:
        return None


# ── Validation ───────────────────────────────────────────────────────────

def validate_shot_log(data: dict) -> list:
    """Validate a shot log against the schema. Returns list of error strings.

    Checks the schema version, required scene fields and scene enums, then
    for every shot: sequential 1-based ``index``, required fields, enum
    membership, ``intensity`` in 1-10 and the shape of
    ``relationship_to_previous``. Finally it checks that shot durations sum
    to within 15% of ``scene.duration_seconds`` and that at least one
    pattern was observed.

    Malformed input (wrong container types, non-numeric numbers) is reported
    as an error string rather than raised, since the data comes straight
    from model output.

    Args:
        data: Parsed shot log.

    Returns:
        Error messages; an empty list means the log is valid. Validation
        stops early when the scene or the shots are missing or mistyped.
    """
    def is_number(value) -> bool:
        return isinstance(value, (int, float))

    if not isinstance(data, dict):
        return [f"Shot log must be a JSON object, got {type(data).__name__}"]

    errors = []

    # Schema version
    if data.get("schema_version") != "1.0":
        errors.append(f"schema_version must be '1.0', got '{data.get('schema_version')}'")

    # Scene required fields
    scene = data.get("scene")
    if not scene:
        errors.append("Missing 'scene' object")
        return errors
    if not isinstance(scene, dict):
        errors.append(f"'scene' must be an object, got {type(scene).__name__}")
        return errors

    for field in ["id", "title", "source", "duration_seconds", "genre_tags",
                  "emotional_arc", "narrative_function"]:
        if field not in scene:
            errors.append(f"scene missing required field: {field}")

    source = scene.get("source", {})
    if not isinstance(source, dict):
        # e.g. "source": null — report it and carry on with the other checks.
        errors.append(f"scene.source must be an object, got {type(source).__name__}")
        source = {}

    if source.get("medium") and source["medium"] not in MEDIUM_ENUM:
        errors.append(f"scene.source.medium '{source['medium']}' not in enum: {MEDIUM_ENUM}")

    if source.get("format") and source["format"] not in FORMAT_ENUM:
        errors.append(f"scene.source.format '{source['format']}' not in enum: {FORMAT_ENUM}")

    nf = scene.get("narrative_function")
    if nf and nf not in NARRATIVE_FUNCTION_ENUM:
        errors.append(f"scene.narrative_function '{nf}' not in enum")

    # Shots
    shots = data.get("shots", [])
    if not shots:
        errors.append("No shots found")
        return errors
    if not isinstance(shots, list):
        errors.append(f"'shots' must be an array, got {type(shots).__name__}")
        return errors

    total_duration_ms = 0
    prev_index = 0

    for i, shot in enumerate(shots):
        prefix = f"shot[{i}]"

        if not isinstance(shot, dict):
            errors.append(f"{prefix}: shot must be an object, got {type(shot).__name__}")
            prev_index += 1  # keep the expected sequence aligned with position
            continue

        # Sequential index
        idx = shot.get("index", 0)
        if not is_number(idx):
            errors.append(f"{prefix}: index {idx!r} must be an integer")
            prev_index += 1
        else:
            if idx != prev_index + 1:
                errors.append(f"{prefix}: index {idx} not sequential (expected {prev_index + 1})")
            prev_index = idx

        # Required fields
        for field in ["index", "duration_ms", "shot_type", "subject",
                      "emotion", "intensity", "beat", "rhythm_role"]:
            if field not in shot:
                errors.append(f"{prefix}: missing required field '{field}'")

        # Enum checks
        if shot.get("shot_type") and shot["shot_type"] not in SHOT_TYPE_ENUM:
            errors.append(f"{prefix}: shot_type '{shot['shot_type']}' not in enum")

        if shot.get("beat") and shot["beat"] not in BEAT_ENUM:
            errors.append(f"{prefix}: beat '{shot['beat']}' not in enum")

        if shot.get("rhythm_role") and shot["rhythm_role"] not in RHYTHM_ROLE_ENUM:
            errors.append(f"{prefix}: rhythm_role '{shot['rhythm_role']}' not in enum")

        # Intensity range
        intensity = shot.get("intensity", 0)
        if not is_number(intensity):
            errors.append(f"{prefix}: intensity {intensity!r} must be a number")
        elif not (1 <= intensity <= 10):
            errors.append(f"{prefix}: intensity {intensity} not in 1-10")

        # relationship_to_previous
        rel = shot.get("relationship_to_previous")
        if i == 0:
            if rel is not None:
                errors.append(f"{prefix}: first shot should have null relationship_to_previous")
        else:
            if rel is None:
                errors.append(f"{prefix}: non-first shot has null relationship_to_previous")
            elif isinstance(rel, dict):
                rtype = rel.get("type")
                if rtype and rtype not in RELATIONSHIP_TYPE_ENUM:
                    errors.append(f"{prefix}: relationship type '{rtype}' not in enum")
            else:
                errors.append(
                    f"{prefix}: relationship_to_previous must be an object or null, "
                    f"got {type(rel).__name__}"
                )

        # Accumulate duration (only numeric values count toward the total)
        duration_ms = shot.get("duration_ms", 0)
        if is_number(duration_ms):
            total_duration_ms += duration_ms
        else:
            errors.append(f"{prefix}: duration_ms {duration_ms!r} must be a number")

    # Duration sanity check
    scene_duration_s = scene.get("duration_seconds", 0)
    if not is_number(scene_duration_s):
        errors.append(f"scene.duration_seconds {scene_duration_s!r} must be a number")
    elif scene_duration_s > 0:
        total_s = total_duration_ms / 1000
        tolerance = scene_duration_s * 0.15
        if abs(total_s - scene_duration_s) > tolerance:
            errors.append(
                f"Shot durations sum to {total_s:.1f}s but scene duration is "
                f"{scene_duration_s:.1f}s (off by more than 15%)"
            )

    # Patterns
    patterns = data.get("patterns_observed", [])
    if not patterns:
        errors.append("No patterns_observed (at least 1 expected)")

    return errors


# ── Calibration ──────────────────────────────────────────────────────────

def run_calibration(corpus_path: str, ground_truth_path: str):
    """Compare Gemini timing vs hand-timed ground truth TSV.

    Prints a per-shot comparison table followed by aggregate error statistics
    and a verdict on how far the Gemini durations can be trusted.

    Args:
        corpus_path: Corpus JSON file; every entry of its ``shots`` list must
            carry ``index`` and ``duration_ms``.
        ground_truth_path: Tab-separated file with a header row that includes
            at least the ``index`` and ``duration_ms`` columns.

    Returns:
        0 after printing a verdict, 1 when no shot index appears in both
        inputs (nothing to compare).
    """
    import statistics

    # Load corpus file
    with open(corpus_path) as f:
        data = json.load(f)

    shots = data.get("shots", [])
    gemini_durations = {s["index"]: s["duration_ms"] for s in shots}

    # Load ground truth TSV: index, start_ms, end_ms, duration_ms
    gt_durations = {}
    with open(ground_truth_path, newline="") as f:
        reader = csv.DictReader(f, delimiter="\t")
        for row in reader:
            idx = int(row["index"])
            gt_durations[idx] = int(row["duration_ms"])

    print(f"Gemini shots: {len(gemini_durations)} | Ground truth shots: {len(gt_durations)}")
    print(f"Shot count match: {'YES' if len(gemini_durations) == len(gt_durations) else 'NO'}")
    print()

    # Per-shot comparison (only indices present on both sides)
    abs_errors = []
    within_200 = 0
    within_500 = 0

    common_indices = sorted(set(gemini_durations) & set(gt_durations))

    print(f"{'Shot':>5} {'Gemini':>8} {'Truth':>8} {'Error':>8}")
    print("-" * 35)

    for idx in common_indices:
        g = gemini_durations[idx]
        t = gt_durations[idx]
        err = abs(g - t)
        abs_errors.append(err)

        if err <= 200:
            within_200 += 1
        if err <= 500:
            within_500 += 1

        # One star: off by 200-500ms; two stars: off by more than 500ms.
        marker = "" if err <= 200 else " *" if err <= 500 else " **"
        print(f"{idx:>5} {g:>7}ms {t:>7}ms {err:>7}ms{marker}")

    if not abs_errors:
        print("No overlapping shots to compare.")
        return 1

    total_compared = len(abs_errors)
    mean_err = sum(abs_errors) / total_compared
    # statistics.median averages the two middle values for an even count;
    # indexing sorted_err[len // 2] would report the upper-middle element.
    median_err = statistics.median(abs_errors)

    print()
    print(f"{'='*40}")
    print(f"Shots compared:     {total_compared}")
    print(f"Mean absolute error: {mean_err:.0f}ms")
    print(f"Median abs error:    {median_err:.0f}ms")
    print(f"Within 200ms:        {within_200}/{total_compared} ({100*within_200/total_compared:.0f}%)")
    print(f"Within 500ms:        {within_500}/{total_compared} ({100*within_500/total_compared:.0f}%)")
    print()

    if mean_err < 300:
        print("VERDICT: RESEARCH-GRADE timing. Mean error <300ms.")
        print("Gemini timing is reliable for rhythm analysis.")
    elif mean_err < 1000:
        print("VERDICT: ADVISORY-ONLY timing. Mean error 300-1000ms.")
        print("Use for general rhythm patterns, not precise tempo analysis.")
    else:
        print("VERDICT: UNRELIABLE timing. Mean error >1000ms.")
        print("Use shot count and ordering only. Discard duration data.")

    return 0


# ── Summary ──────────────────────────────────────────────────────────────

def _counter_table(counter, total, label, name_width=18):
    """Write ``counter`` to stdout as a ranked table with a text histogram.

    Rows follow ``counter.most_common()`` order. Each row shows the key
    (padded to ``name_width``), its count, its share of ``total`` as a
    percentage, and one ``#`` per two whole percentage points. A falsy
    ``total`` yields 0% for every row instead of dividing by zero.
    """
    print(f"\n--- {label} ---")
    for key, hits in counter.most_common():
        if total:
            share = 100 * hits / total
        else:
            share = 0
        histogram = "#" * int(share / 2)
        name_cell = format(str(key), f"{name_width}s")
        print(f"  {name_cell}: {hits:3d} ({share:4.1f}%) {histogram}")


def run_summary():
    """Aggregate stats across all corpus files and print a report to stdout.

    Reads every ``*.json`` file in the directory returned by ``corpus_dir()``.
    Files that cannot be read, decoded or parsed (or whose top level is not a
    JSON object) are skipped silently. Scenes flagged with
    ``exclude_from_visual_grammar`` or ``exclude_from_visual_score`` are
    counted as excluded and left out of every statistic.

    JSON ``null`` values in nested fields (``scene``, ``source``, ``camera``,
    ``lighting``, ``framing``, ``medium``, ``shot_type``, ...) are treated the
    same as a missing key so one sparse file cannot abort the whole report.

    Returns:
        0 after printing the report, 1 when no corpus JSON files exist.
    """
    cd = corpus_dir()
    files = sorted(cd.glob("*.json"))

    if not files:
        print("No corpus files found.", file=sys.stderr)
        return 1

    from collections import Counter

    total_scenes = 0
    total_shots = 0
    excluded_scenes = 0  # opening_grammar etc
    medium_asl = {}  # medium -> [shot_durations]
    pattern_counts = {}  # pattern_name -> count
    genre_counts = {}
    two_peak_count = 0
    microdrama_count = 0
    spike_timestamps = []
    button_timestamps = []
    grammar_markers = {}  # marker -> count

    # Composition counters (aggregate across all scenes)
    all_shot_types = Counter()
    all_angles = Counter()
    all_movements = Counter()
    all_lenses = Counter()
    all_dof = Counter()
    all_relationships = Counter()
    all_rhythm_roles = Counter()
    all_lighting_keys = Counter()
    all_vertical_pos = Counter()
    all_subject_counts = Counter()

    # Per-medium composition
    medium_shot_types = {}  # medium -> Counter

    for fp in files:
        try:
            # Context manager so the handle is closed even when parsing fails.
            with open(fp) as fh:
                data = json.load(fh)
        except (ValueError, OSError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            continue
        if not isinstance(data, dict):
            continue

        # Skip scenes flagged as excluded (openings, sizzle reels)
        if data.get("exclude_from_visual_grammar") or data.get("exclude_from_visual_score"):
            excluded_scenes += 1
            continue

        total_scenes += 1
        scene = data.get("scene") or {}
        # "or" fallbacks: a null medium would break sorting and ":25s" below.
        medium = (scene.get("source") or {}).get("medium") or "unknown"
        shots = data.get("shots") or []
        total_shots += len(shots)

        medium_shot_types.setdefault(medium, Counter())

        # ASL by medium
        for s in shots:
            dur = s.get("duration_ms") or 0
            medium_asl.setdefault(medium, []).append(dur)

            # Aggregate composition data
            st = s.get("shot_type") or "?"
            all_shot_types[st] += 1
            medium_shot_types[medium][st] += 1

            cam = s.get("camera") or {}
            all_angles[cam.get("angle", "?")] += 1
            all_movements[cam.get("movement", "?")] += 1
            all_lenses[cam.get("lens_feel", "?")] += 1
            all_dof[cam.get("depth_of_field", "?")] += 1

            all_rhythm_roles[s.get("rhythm_role", "?")] += 1
            all_lighting_keys[(s.get("lighting") or {}).get("key", "?")] += 1
            all_vertical_pos[(s.get("framing") or {}).get("vertical_position", "?")] += 1
            all_subject_counts[s.get("subject_count", 0)] += 1

            rel = s.get("relationship_to_previous")
            if rel and isinstance(rel, dict):
                all_relationships[rel.get("type", "?")] += 1

        # Patterns
        for p in data.get("patterns_observed") or []:
            name = p.get("pattern_name") or "unknown"
            pattern_counts[name] = pattern_counts.get(name, 0) + 1

        # Genres
        for g in scene.get("genre_tags") or []:
            genre_counts[g] = genre_counts.get(g, 0) + 1

        # Two-Peak analysis
        ep_struct = data.get("episode_structure")
        if medium == "microdrama":
            microdrama_count += 1
            if ep_struct and ep_struct.get("has_two_peak"):
                two_peak_count += 1
                if ep_struct.get("spike_timestamp_s"):
                    spike_timestamps.append(ep_struct["spike_timestamp_s"])
                if ep_struct.get("button_timestamp_s"):
                    button_timestamps.append(ep_struct["button_timestamp_s"])

            # Vertical grammar markers
            if ep_struct and ep_struct.get("vertical_grammar_markers"):
                for marker in ep_struct["vertical_grammar_markers"]:
                    grammar_markers[marker] = grammar_markers.get(marker, 0) + 1

    # Print summary
    print(f"{'='*60}")
    print(f"VISUAL GRAMMAR BIBLE CORPUS SUMMARY")
    print(f"{'='*60}")
    print(f"\nTotal scenes:  {total_scenes}")
    if excluded_scenes:
        print(f"Excluded:      {excluded_scenes} (opening_grammar — not in Visual Grammar Bible)")
    print(f"Total shots:   {total_shots}")
    print(f"Corpus files:  {cd}/")

    # ASL by medium
    print(f"\n--- Average Shot Length by Medium ---")
    for medium, durations in sorted(medium_asl.items()):
        if durations:
            avg_ms = sum(durations) / len(durations)
            print(f"  {medium:25s}: {avg_ms:.0f}ms ({avg_ms/1000:.2f}s) — {len(durations)} shots")

    # Composition breakdowns
    _counter_table(all_shot_types, total_shots, "Shot Type Distribution (All Scenes)", 8)
    _counter_table(all_angles, total_shots, "Camera Angle Distribution", 14)
    _counter_table(all_movements, total_shots, "Camera Movement Distribution", 16)
    _counter_table(all_lenses, total_shots, "Lens Feel Distribution", 10)
    _counter_table(all_dof, total_shots, "Depth of Field Distribution", 10)
    # The first shot of each scene has no predecessor, hence the denominator.
    _counter_table(all_relationships, total_shots - total_scenes,
                   "Shot-to-Shot Relationship Types", 16)
    _counter_table(all_rhythm_roles, total_shots, "Rhythm Role Distribution", 12)
    _counter_table(all_lighting_keys, total_shots, "Lighting Key Distribution", 16)
    _counter_table(all_vertical_pos, total_shots, "Framing: Vertical Position", 18)

    # Subject count
    print(f"\n--- Subject Count (characters in frame) ---")
    for sc in sorted(all_subject_counts.keys()):
        count = all_subject_counts[sc]
        pct = 100 * count / total_shots if total_shots else 0
        print(f"  {sc} subjects: {count:3d} ({pct:4.1f}%)")

    # Per-medium shot type comparison
    if len(medium_shot_types) > 1:
        print(f"\n--- Shot Type by Medium (%) ---")
        mediums_sorted = sorted(medium_shot_types.keys())
        all_types = sorted(set().union(*[c.keys() for c in medium_shot_types.values()]))
        header = f"  {'':8s}" + "".join(f"{m:>18s}" for m in mediums_sorted)
        print(header)
        for st in all_types:
            row = f"  {st:8s}"
            for m in mediums_sorted:
                total_m = sum(medium_shot_types[m].values())
                count = medium_shot_types[m].get(st, 0)
                pct = 100 * count / total_m if total_m else 0
                row += f"{pct:>17.1f}%"
            print(row)

    # Most common patterns
    print(f"\n--- Most Common Patterns (top 15) ---")
    for name, count in sorted(pattern_counts.items(), key=lambda x: -x[1])[:15]:
        print(f"  {name:35s}: {count}")

    # Genre distribution
    print(f"\n--- Genre Distribution ---")
    for genre, count in sorted(genre_counts.items(), key=lambda x: -x[1]):
        print(f"  {str(genre):25s}: {count}")

    # Two-Peak analysis
    if microdrama_count > 0:
        print(f"\n--- Two-Peak Structure (microdrama only) ---")
        print(f"  Microdramas analyzed: {microdrama_count}")
        print(f"  Two-peak detected:    {two_peak_count} ({100*two_peak_count/microdrama_count:.0f}%)")
        if spike_timestamps:
            avg_spike = sum(spike_timestamps) / len(spike_timestamps)
            print(f"  Avg spike timestamp:  {avg_spike:.1f}s (theory: 45-55s)")
        if button_timestamps:
            avg_button = sum(button_timestamps) / len(button_timestamps)
            print(f"  Avg button timestamp: {avg_button:.1f}s (theory: 85-90s)")

        # Grammar markers
        if grammar_markers:
            print(f"\n--- Vertical Grammar Markers ---")
            for marker, count in sorted(grammar_markers.items(), key=lambda x: -x[1]):
                pct = 100 * count / microdrama_count
                print(f"  {str(marker):30s}: {count}/{microdrama_count} ({pct:.0f}%)")

    print(f"\n{'='*60}")
    return 0


# ── Single Scene Analysis ────────────────────────────────────────────────

def analyze_scene(
    client,
    video_source: dict,
    scene_id: str,
    title: str,
    work: str,
    medium: str,
    genre_tags: list,
    format_ratio: str = "16:9",
    url: Optional[str] = None,
    dry_run: bool = False,
) -> dict:
    """Analyze a single scene and return the parsed shot log.

    Builds the analysis prompt, sends it with the video to Gemini, parses and
    validates the response, writes it to ``<corpus_dir>/<scene_id>.json`` and
    prints a short summary to stderr. Validation problems are reported as
    warnings only; the file is saved regardless.

    Args:
        client: Gemini client passed through to ``call_gemini_video``.
        video_source: Source descriptor passed through to ``call_gemini_video``.
        scene_id: Scene ID; also the stem of the output file name.
        title, work, medium, genre_tags, format_ratio: Scene metadata embedded
            in the prompt.
        url: When given, stored as ``scene.source.url`` in the saved data.
        dry_run: Print the prompt preview to stdout and make no API call.

    Returns:
        The parsed shot log dict; ``{}`` in dry-run mode. NOTE: when the
        response cannot be parsed this returns the integer ``1`` rather than a
        dict, so callers must check the type of the result.
    """

    prompt = build_analysis_prompt(
        scene_id=scene_id,
        title=title,
        work=work,
        medium=medium,
        genre_tags=genre_tags,
        format_ratio=format_ratio,
    )

    if dry_run:
        print(f"--- DRY RUN: {scene_id} ---")
        print(f"Prompt length: {len(prompt)} chars")
        print(f"Video source: {video_source}")
        print(f"Prompt preview (first 500 chars):\n{prompt[:500]}...")
        return {}

    print(f"\nAnalyzing: {scene_id} — {title}", file=sys.stderr)
    print(f"  Medium: {medium} | Genres: {genre_tags}", file=sys.stderr)

    response_text = call_gemini_video(client, prompt, video_source)
    data = parse_gemini_response(response_text)
    if data is None:
        print(f"ERROR: Failed to parse Gemini response for {scene_id}", file=sys.stderr)
        print(f"  Response length: {len(response_text)} chars", file=sys.stderr)
        return 1

    # Inject URL if we have one (only when the model produced a source object)
    scene_block = data.get("scene")
    if url and isinstance(scene_block, dict) and isinstance(scene_block.get("source"), dict):
        scene_block["source"]["url"] = url

    # Validate
    errors = validate_shot_log(data)
    if errors:
        print(f"\n  VALIDATION WARNINGS ({len(errors)}):", file=sys.stderr)
        for e in errors[:10]:
            print(f"    - {e}", file=sys.stderr)
        if len(errors) > 10:
            print(f"    ... and {len(errors) - 10} more", file=sys.stderr)

    # Save
    output_path = corpus_dir() / f"{scene_id}.json"
    with open(output_path, "w") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"  Saved: {output_path}", file=sys.stderr)

    # Print summary. The file is already on disk, so JSON nulls in the
    # model output are treated as missing values rather than allowed to
    # raise here and make a saved scene look like a failure.
    shots = data.get("shots") or []
    patterns = data.get("patterns_observed") or []
    duration = (scene_block.get("duration_seconds") if isinstance(scene_block, dict) else 0) or 0
    asl = (sum((s.get("duration_ms") or 0) for s in shots) / len(shots) / 1000) if shots else 0

    print(f"\n  Shots: {len(shots)}", file=sys.stderr)
    print(f"  Duration: {duration:.1f}s", file=sys.stderr)
    print(f"  ASL: {asl:.2f}s", file=sys.stderr)
    print(f"  Patterns: {len(patterns)}", file=sys.stderr)
    if patterns:
        for p in patterns[:5]:
            print(f"    - {p.get('pattern_name')}: {p.get('emotional_function', '')}", file=sys.stderr)
    print(f"  Validation errors: {len(errors)}", file=sys.stderr)

    ep_struct = data.get("episode_structure")
    if ep_struct:
        if ep_struct.get("has_two_peak"):
            print(f"  Two-Peak: Spike@{ep_struct.get('spike_timestamp_s')}s, "
                  f"Button@{ep_struct.get('button_timestamp_s')}s", file=sys.stderr)
        else:
            print(f"  Two-Peak: NOT detected", file=sys.stderr)

    return data


# ── Batch Mode ───────────────────────────────────────────────────────────

def run_batch(tsv_path: str, dry_run: bool = False):
    """Process a batch of scenes from a TSV file.

    TSV columns: id, url_or_path, title, work, medium, genre_tags, format
    genre_tags is comma-separated within the field.

    Rows without an id or url_or_path, rows whose scene is already in the
    corpus (unless ``dry_run``), and rows pointing at a missing local file
    are skipped. A blank ``format`` cell falls back to ``16:9``.

    Args:
        tsv_path: Path to the tab-separated batch file (with header row).
        dry_run: Pass-through to ``analyze_scene``; no client is created and
            no rate-limit delay is applied.

    Returns:
        1 when the TSV has no data rows or any scene failed, otherwise 0.
    """
    with open(tsv_path, newline="") as f:
        rows = list(csv.DictReader(f, delimiter="\t"))

    if not rows:
        print("ERROR: No rows in TSV file.", file=sys.stderr)
        return 1

    print(f"Batch: {len(rows)} scenes from {tsv_path}", file=sys.stderr)

    client = None if dry_run else get_client()
    results = {"success": 0, "failed": 0, "skipped": 0}

    def cell(row, key):
        # DictReader stores None (not a missing key) for fields absent from a
        # short row, so a .get() default alone would not protect .strip().
        return (row.get(key) or "").strip()

    for i, row in enumerate(rows):
        scene_id = cell(row, "id")
        url_or_path = cell(row, "url_or_path")
        title = cell(row, "title")
        work = cell(row, "work")
        medium = cell(row, "medium")
        genre_tags = [g.strip() for g in cell(row, "genre_tags").split(",") if g.strip()]
        format_ratio = cell(row, "format") or "16:9"

        if not scene_id or not url_or_path:
            print(f"  [{i+1}/{len(rows)}] SKIP: missing id or url_or_path", file=sys.stderr)
            results["skipped"] += 1
            continue

        # Check if already in corpus
        existing = corpus_dir() / f"{scene_id}.json"
        if existing.exists() and not dry_run:
            print(f"  [{i+1}/{len(rows)}] SKIP: {scene_id} already in corpus", file=sys.stderr)
            results["skipped"] += 1
            continue

        # Determine video source
        if url_or_path.startswith("http"):
            video_source = {"type": "url", "url": url_or_path}
            url = url_or_path
        else:
            if not Path(url_or_path).exists():
                print(f"  [{i+1}/{len(rows)}] SKIP: file not found: {url_or_path}", file=sys.stderr)
                results["skipped"] += 1
                continue
            video_source = {"type": "file", "path": url_or_path}
            url = None

        print(f"\n  [{i+1}/{len(rows)}] {scene_id}", file=sys.stderr)

        try:
            outcome = analyze_scene(
                client=client,
                video_source=video_source,
                scene_id=scene_id,
                title=title,
                work=work,
                medium=medium,
                genre_tags=genre_tags,
                format_ratio=format_ratio,
                url=url,
                dry_run=dry_run,
            )
            # analyze_scene reports an unparseable response by returning a
            # non-dict value (and prints its own error); count that as failed.
            if isinstance(outcome, dict):
                results["success"] += 1
            else:
                results["failed"] += 1
        except Exception as e:
            print(f"  ERROR: {e}", file=sys.stderr)
            err_str = str(e)
            if "YouTube" in err_str or "not accessible" in err_str or "403" in err_str:
                print(
                    f"\n  YouTube URL not accessible. Download with:\n"
                    f"    yt-dlp -o video.mp4 \"{url_or_path}\"\n"
                    f"  Then update TSV with local file path.\n",
                    file=sys.stderr,
                )
            results["failed"] += 1

        # Rate limit between scenes
        if i < len(rows) - 1 and not dry_run:
            time.sleep(RATE_LIMIT_DELAY)

    print(f"\n{'='*40}", file=sys.stderr)
    print(f"Batch complete: {results['success']} success, "
          f"{results['failed']} failed, {results['skipped']} skipped", file=sys.stderr)
    return 0 if results["failed"] == 0 else 1


# ── Pattern Extraction (Second Pass) ────────────────────────────────────

def build_pattern_prompt(data: dict) -> str:
    """Build a text-only prompt for deep pattern extraction from an existing shot log.

    The shots are condensed into one fixed-width table row each, so the model
    sees the editorial structure without the full JSON bulk.

    Shot logs are model-produced, so fields may be ``null``, missing, or of an
    unexpected type. Such values are rendered with placeholders (``?`` for
    text, ``0`` for numbers) instead of raising from the format specs.

    Args:
        data: Shot log with optional ``scene`` and ``shots`` entries.

    Returns:
        The complete prompt text.
    """

    def _text(value, default="?"):
        # None would raise under a ":Ns" format spec and under slicing.
        return default if value is None else str(value)

    def _whole(value, default=0):
        # ":d" rejects floats and None; coerce, falling back to the default.
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return default

    def _mapping(value):
        return value if isinstance(value, dict) else {}

    scene = _mapping(data.get("scene"))
    source = _mapping(scene.get("source"))
    shots = data.get("shots") or []

    # Compact shot summary — enough for pattern detection without full JSON bulk
    shot_lines = []
    for raw_shot in shots:
        s = _mapping(raw_shot)
        camera = _mapping(s.get("camera"))
        idx = _whole(s.get("index"))
        dur = _whole(s.get("duration_ms"))
        st = _text(s.get("shot_type"))
        subj = _text(s.get("subject"))[:40]
        emotion = _text(s.get("emotion"))
        intensity = _whole(s.get("intensity"))
        beat = _text(s.get("beat"))
        rhythm = _text(s.get("rhythm_role"))
        angle = _text(camera.get("angle"))
        movement = _text(camera.get("movement"))
        dof = _text(camera.get("depth_of_field"))
        rel = s.get("relationship_to_previous")
        # A shot without a relationship object is labelled "first".
        if isinstance(rel, dict):
            rel_type = _text(rel.get("type"))
            rel_dim = _text(rel.get("dimension"), "")
        else:
            rel_type = "first"
            rel_dim = ""

        shot_lines.append(
            f"  {idx:3d} | {dur:5d}ms | {st:5s} | {angle:10s} | {movement:12s} | "
            f"{dof:7s} | {emotion:12s} | {intensity:2d} | {beat:12s} | {rhythm:10s} | "
            f"{rel_type:12s} | {rel_dim:10s} | {subj}"
        )

    shots_table = "\n".join(shot_lines)
    header = (
        f"  {'#':>3s} | {'dur':>5s}   | {'type':5s} | {'angle':10s} | {'movement':12s} | "
        f"{'dof':7s} | {'emotion':12s} | {'i':2s} | {'beat':12s} | {'rhythm':10s} | "
        f"{'rel_type':12s} | {'rel_dim':10s} | subject"
    )

    prompt = f"""# ROLE

You are an expert film editor and cinematographer. You have already analyzed a scene
shot-by-shot. Now you are doing a SECOND PASS focused exclusively on identifying
multi-shot PATTERNS — recurring editorial strategies, rhythmic structures, and
cinematic grammar rules.

# SCENE CONTEXT

- Title: {scene.get('title', '?')}
- Work: {source.get('work', '?')}
- Medium: {source.get('medium', '?')}
- Duration: {scene.get('duration_seconds', 0)}s
- Total shots: {len(shots)}
- Emotional arc: {scene.get('emotional_arc', '?')}

# SHOT DATA

{header}
{'-'*160}
{shots_table}

# YOUR TASK

Analyze this shot data for PATTERNS. Look for:

1. **Scale Patterns**: How does shot size change across sequences?
   - Scale ladders (progressive CU→MS→LS or reverse)
   - Scale oscillation (alternating tight/wide)
   - Scale shock (sudden jump from ECU to WIDE or vice versa)

2. **Rhythm Patterns**: How do durations create pacing?
   - Compression sequences (shots getting progressively shorter)
   - Expansion sequences (shots getting progressively longer)
   - Pulse patterns (regular alternation of long/short)
   - Staccato bursts (<500ms shots in sequence)
   - Sustained holds (single shot >4000ms) and what triggers them

3. **Relationship Sequences**: What editorial grammar is used?
   - Identify 3-5 shot relationship chains (e.g., contrast→continuation→continuation→contrast)
   - What does each chain accomplish emotionally?

4. **Camera Behavior Patterns**: How does the camera relate to emotion?
   - When does the camera move vs stay static?
   - When does angle change (low/high) and what triggers it?
   - When does DoF shift (shallow/deep) and what does it signal?

5. **Intensity Mapping**: How does visual grammar map to emotional intensity?
   - What shot types appear at intensity 9-10 vs 4-6?
   - What triggers intensity spikes and drops?

6. **Structural Patterns**: Scene-level architecture
   - How does the scene open? (first 3-5 shots)
   - How does it build? (middle section)
   - How does it climax? (peak sequence)
   - How does it resolve? (final 3-5 shots)

# OUTPUT FORMAT

Return ONLY valid JSON (no markdown fences). Structure:

{{
  "patterns": [
    {{
      "pattern_name": "snake_case_name",
      "description": "What this pattern IS and WHY it works cinematically",
      "shot_range": [start_index, end_index],
      "emotional_function": "What emotional effect it creates",
      "rhythm_signature": "Compact notation (e.g., 'rapid_rapid_rapid_HOLD_breath')",
      "relationship_sequence": ["type1", "type2", ...],
      "scale_sequence": ["shot_type1", "shot_type2", ...],
      "transferability": "How well this transfers to vertical microdrama (9:16)",
      "confidence": "high|medium|low"
    }}
  ],
  "scale_grammar": {{
    "dominant_shot_type": "Most used shot type and why",
    "scale_transitions": "How does the scene move between shot sizes?",
    "scale_rules": ["Rule 1", "Rule 2"]
  }},
  "rhythm_grammar": {{
    "base_tempo_ms": <median shot duration>,
    "fastest_sequence_ms": <shortest shot in scene>,
    "longest_hold_ms": <longest shot>,
    "acceleration_triggers": "What causes cuts to speed up?",
    "deceleration_triggers": "What causes cuts to slow down?"
  }},
  "camera_grammar": {{
    "movement_rules": ["When camera moves and why"],
    "angle_rules": ["When angle changes and why"],
    "dof_rules": ["When DoF changes and why"]
  }},
  "scene_architecture": {{
    "opening_strategy": "How the first 3-5 shots establish the scene",
    "build_strategy": "How the middle escalates",
    "climax_strategy": "How the peak sequence is constructed",
    "resolution_strategy": "How the final shots land"
  }}
}}

Be thorough. Identify at least 5 patterns. This is the analysis that matters for
building the Visual Grammar Bible.
"""
    return prompt


def run_pattern_extraction(corpus_path: str):
    """Second-pass pattern extraction from an existing corpus JSON file (text-only, no video).

    Builds a prompt from the stored shot log, sends it to Gemini (retrying up
    to ``MAX_RETRIES`` times with a growing delay), then merges the returned
    patterns into ``patterns_observed`` and stores the grammar sections under
    ``pattern_analysis``. The corpus file is rewritten in place. Patterns
    whose name already exists (in the file or earlier in the same response)
    are not added again.

    Args:
        corpus_path: Path to the corpus JSON file to read and update.

    Returns:
        0 on success, 1 when the API call fails or its response is empty or
        unparseable, 2 when the corpus file cannot be read or parsed.
    """
    try:
        with open(corpus_path) as f:
            data = json.load(f)
    except (json.JSONDecodeError, OSError) as e:
        print(f"ERROR: Could not read {corpus_path}: {e}", file=sys.stderr)
        return 2

    scene_id = data.get("scene", {}).get("id", "unknown")
    print(f"Pattern extraction: {scene_id}", file=sys.stderr)
    print(f"  Shots: {len(data.get('shots', []))}", file=sys.stderr)

    prompt = build_pattern_prompt(data)
    print(f"  Prompt: {len(prompt)} chars", file=sys.stderr)

    client = get_client()

    from google.genai import types

    response = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            print(f"  Sending to {MODEL} (text-only, attempt {attempt}/{MAX_RETRIES})...",
                  file=sys.stderr)
            response = client.models.generate_content(
                model=MODEL,
                contents=[prompt],
                config=types.GenerateContentConfig(
                    temperature=0.3,
                    max_output_tokens=MAX_OUTPUT_TOKENS,
                ),
            )
            break
        except Exception as e:
            if attempt < MAX_RETRIES:
                delay = RATE_LIMIT_DELAY * attempt * 2
                print(f"  Error: {str(e)[:200]}", file=sys.stderr)
                print(f"  Retrying in {delay}s...", file=sys.stderr)
                time.sleep(delay)
            else:
                print(f"ERROR: Gemini API failed after {MAX_RETRIES} attempts: {str(e)[:300]}",
                      file=sys.stderr)
                return 1

    # An empty or blocked response has no text; bail out before len()/slicing.
    response_text = response.text if response is not None else None
    if not response_text:
        print(f"ERROR: Gemini returned empty response.", file=sys.stderr)
        return 1

    result = parse_gemini_response(response_text)
    if not isinstance(result, dict):
        print(f"ERROR: Failed to parse pattern extraction response for {scene_id}", file=sys.stderr)
        print(f"  Response length: {len(response_text)} chars", file=sys.stderr)
        print(f"  Response start: {response_text[:200]}", file=sys.stderr)
        return 1

    # Merge patterns into the corpus file
    new_patterns = [p for p in (result.get("patterns") or []) if isinstance(p, dict)]
    existing_patterns = data.get("patterns_observed") or []

    # Convert new format to schema format
    merged_patterns = list(existing_patterns)
    existing_names = {p.get("pattern_name") for p in existing_patterns}

    for p in new_patterns:
        name = p.get("pattern_name")
        if name in existing_names:
            continue
        merged_patterns.append({
            "pattern_name": p.get("pattern_name", "unknown"),
            "description": p.get("description", ""),
            "shot_range": p.get("shot_range", [0, 0]),
            "emotional_function": p.get("emotional_function", ""),
            "rhythm_signature": p.get("rhythm_signature", ""),
            "relationship_sequence": p.get("relationship_sequence", []),
        })
        # Track the name so a repeat later in this response is not re-added.
        # Unnamed patterns are kept distinct, as before.
        if name is not None:
            existing_names.add(name)

    data["patterns_observed"] = merged_patterns

    # Store the full analysis as supplementary data
    data["pattern_analysis"] = {
        "scale_grammar": result.get("scale_grammar", {}),
        "rhythm_grammar": result.get("rhythm_grammar", {}),
        "camera_grammar": result.get("camera_grammar", {}),
        "scene_architecture": result.get("scene_architecture", {}),
        "extracted_at": datetime.now(timezone.utc).isoformat(),
    }

    # Save back
    output_path = Path(corpus_path)
    with open(output_path, "w") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"\n  Patterns before: {len(existing_patterns)}", file=sys.stderr)
    print(f"  Patterns after:  {len(merged_patterns)} (+{len(merged_patterns) - len(existing_patterns)} new)",
          file=sys.stderr)
    print(f"  Saved: {output_path}", file=sys.stderr)

    # Print pattern summary
    print(f"\n--- Extracted Patterns ---", file=sys.stderr)
    for p in new_patterns:
        # str(): the ":6s" spec raises on None or other non-string values.
        conf = str(p.get("confidence", "?"))
        name = p.get("pattern_name", "?")
        func = p.get("emotional_function", "?")
        rng = p.get("shot_range", [])
        print(f"  [{conf:6s}] {name}: {func} (shots {rng})", file=sys.stderr)

    # Print grammar summaries
    for section in ["scale_grammar", "rhythm_grammar", "camera_grammar", "scene_architecture"]:
        section_data = result.get(section, {})
        if section_data and isinstance(section_data, dict):
            print(f"\n--- {section.replace('_', ' ').title()} ---", file=sys.stderr)
            for k, v in section_data.items():
                if isinstance(v, list):
                    print(f"  {k}:", file=sys.stderr)
                    for item in v:
                        print(f"    - {item}", file=sys.stderr)
                else:
                    print(f"  {k}: {v}", file=sys.stderr)

    return 0


# ── Episode Detection (Microdrama Compilations) ─────────────────────────

def run_detect_episodes(video_path: str, output_dir: Optional[str] = None):
    """Use Gemini to detect episode boundaries in a microdrama compilation.

    Sends the video to Gemini and asks it to identify freeze frames, title cards,
    payment/coin-lock screens, and other episode boundary markers. Returns timecodes
    for each episode, then clips with ffmpeg.

    The detection result is written to ``episode_detection.json`` in the
    output directory before clipping. Episodes shorter than 15s or longer than
    300s are not clipped, and existing clip files are left untouched.
    Timestamps or episode numbers that are null or non-numeric in the model
    output are replaced by 0 / the running clip position instead of raising.

    Args:
        video_path: Local compilation video to upload and clip.
        output_dir: Directory for the clips and detection JSON; defaults to
            ``<video stem>_episodes`` next to the video.

    Returns:
        1 when the video or ffmpeg is missing, the upload or API call fails,
        or no episodes can be extracted from the response; otherwise 0 (even
        if individual clips failed).
    """
    import shutil
    import subprocess

    def _seconds(value):
        # Keep real numbers as-is; treat null/garbage from the model as 0 so
        # arithmetic and ":.1f" formatting below cannot raise.
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return value
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0

    video_path = Path(video_path)
    if not video_path.exists():
        print(f"ERROR: Video file not found: {video_path}", file=sys.stderr)
        return 1

    # Check ffmpeg
    if not shutil.which("ffmpeg"):
        print("ERROR: ffmpeg not installed. Run: brew install ffmpeg", file=sys.stderr)
        return 1

    if output_dir:
        out_dir = Path(output_dir)
    else:
        out_dir = video_path.parent / f"{video_path.stem}_episodes"
    out_dir.mkdir(parents=True, exist_ok=True)

    # Upload video to Gemini
    client = get_client()
    from google.genai import types

    mime = _guess_video_mime(str(video_path))
    size_mb = video_path.stat().st_size / (1024 * 1024)
    print(f"Uploading {video_path.name} ({size_mb:.1f} MB, {mime})...", file=sys.stderr)

    uploaded = client.files.upload(file=str(video_path))
    # Poll until the uploaded file leaves the PROCESSING state.
    while uploaded.state == "PROCESSING":
        time.sleep(2)
        uploaded = client.files.get(name=uploaded.name)
    if uploaded.state == "FAILED":
        print(f"ERROR: File upload failed", file=sys.stderr)
        return 1

    print(f"Upload complete. Detecting episode boundaries...", file=sys.stderr)

    prompt = """# TASK

You are analyzing a compilation video that contains multiple short microdrama episodes
edited together. Your job is to identify where each individual episode begins and ends.

# EPISODE BOUNDARY MARKERS

Look for these visual cues that indicate episode boundaries:
- **Freeze frames** at the end of an episode (image holds for 1-3 seconds)
- **Title cards** or text overlays showing episode titles/numbers
- **Coin-lock / payment screens** ("Unlock next episode", coin icons)
- **Black screens** or fade-to-black transitions between episodes
- **App UI overlays** (like/comment/share buttons, progress bars)
- **"Next episode" previews** or recap sequences
- **Dramatic music stings** followed by silence or new music
- **Complete tonal/scene shifts** (new location, new characters, new conflict)

# OUTPUT FORMAT

Return a JSON array of episode objects. For each episode:
```json
{
  "episodes": [
    {
      "episode_number": 1,
      "start_time_s": 0.0,
      "end_time_s": 65.5,
      "title": "Episode 1 title if visible, otherwise null",
      "boundary_type": "freeze_frame|title_card|coin_lock|black_screen|fade|unknown",
      "notes": "Brief description of what happens"
    }
  ],
  "total_duration_s": 1800.0,
  "compilation_type": "single_series|mixed_series|unknown",
  "series_title": "Title if identifiable",
  "confidence": "high|medium|low"
}
```

Be precise with timestamps (to 0.5s accuracy). It's better to include a boundary
you're unsure about than to miss one. Each episode in a microdrama is typically
60-120 seconds long.

Return ONLY the JSON, no explanation."""

    safety = [
        types.SafetySetting(category=cat, threshold="BLOCK_NONE")
        for cat in [
            "HARM_CATEGORY_HARASSMENT",
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
        ]
    ]

    parts = [
        types.Part.from_uri(file_uri=uploaded.uri, mime_type=mime),
        prompt,
    ]

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            print(f"  Sending to {MODEL} (attempt {attempt}/{MAX_RETRIES})...", file=sys.stderr)
            response = client.models.generate_content(
                model=MODEL,
                contents=parts,
                config=types.GenerateContentConfig(
                    temperature=0.2,
                    max_output_tokens=8192,
                    safety_settings=safety,
                ),
            )
            break
        except Exception as e:
            if attempt < MAX_RETRIES:
                delay = RATE_LIMIT_DELAY * attempt * 2
                print(f"  Error: {str(e)[:200]}", file=sys.stderr)
                print(f"  Retrying in {delay}s...", file=sys.stderr)
                time.sleep(delay)
            else:
                print(f"ERROR: Gemini API failed: {str(e)[:300]}", file=sys.stderr)
                return 1

    # Check for safety blocks or empty responses
    if not response.text:
        print(f"ERROR: Gemini returned empty response.", file=sys.stderr)
        if hasattr(response, 'candidates') and response.candidates:
            for c in response.candidates:
                if hasattr(c, 'finish_reason'):
                    print(f"  Finish reason: {c.finish_reason}", file=sys.stderr)
                if hasattr(c, 'safety_ratings'):
                    print(f"  Safety ratings: {c.safety_ratings}", file=sys.stderr)
        print("  Video may be too long or content may have been blocked.", file=sys.stderr)
        return 1

    result = parse_gemini_response(response.text)
    # The prompt asks for "a JSON array", so accept a bare list of episodes
    # as well as the documented wrapper object.
    if isinstance(result, list):
        result = {"episodes": result}
    if not isinstance(result, dict):
        print(f"ERROR: Could not parse episode detection response", file=sys.stderr)
        return 1

    episodes = [ep for ep in (result.get("episodes") or []) if isinstance(ep, dict)]
    if not episodes:
        print("ERROR: No episodes detected in video", file=sys.stderr)
        return 1

    print(f"\nDetected {len(episodes)} episodes:", file=sys.stderr)
    for ep in episodes:
        start = _seconds(ep.get("start_time_s", 0))
        end = _seconds(ep.get("end_time_s", 0))
        dur = end - start
        title = ep.get("title") or f"Episode {ep.get('episode_number', '?')}"
        boundary = ep.get("boundary_type", "?")
        print(f"  [{start:6.1f}s - {end:6.1f}s] ({dur:.0f}s) {title} [{boundary}]", file=sys.stderr)

    # Save detection results
    detect_json = out_dir / "episode_detection.json"
    with open(detect_json, "w") as f:
        json.dump(result, f, indent=2)
    print(f"\nSaved detection results: {detect_json}", file=sys.stderr)

    # Clip episodes with ffmpeg
    print(f"\nClipping {len(episodes)} episodes to {out_dir}/...", file=sys.stderr)
    clipped = 0
    for ep in episodes:
        # Episode number feeds a ":02d" spec and the file name, so it must be
        # an int; fall back to the running position when it is not usable.
        try:
            num = int(ep.get("episode_number", clipped + 1))
        except (TypeError, ValueError):
            num = clipped + 1
        start = _seconds(ep.get("start_time_s", 0))
        end = _seconds(ep.get("end_time_s", 0))
        dur = end - start

        if dur < 15:
            print(f"  SKIP ep{num:02d}: too short ({dur:.0f}s)", file=sys.stderr)
            continue
        if dur > 300:
            print(f"  SKIP ep{num:02d}: too long ({dur:.0f}s) — likely not a single episode", file=sys.stderr)
            continue

        out_file = out_dir / f"ep{num:02d}_{video_path.stem}.mp4"
        if out_file.exists():
            print(f"  SKIP ep{num:02d}: already clipped", file=sys.stderr)
            clipped += 1
            continue

        try:
            # -ss before -i seeks on the input; "-c copy" avoids re-encoding.
            cmd = [
                "ffmpeg", "-y",
                "-ss", str(start),
                "-i", str(video_path),
                "-t", str(dur),
                "-c", "copy",
                "-avoid_negative_ts", "make_zero",
                str(out_file),
            ]
            proc = subprocess.run(cmd, capture_output=True, timeout=60)

            if out_file.exists() and out_file.stat().st_size > 10_000:
                size_mb = out_file.stat().st_size / (1024 * 1024)
                print(f"  OK ep{num:02d}: {dur:.0f}s, {size_mb:.1f} MB → {out_file.name}", file=sys.stderr)
                clipped += 1
            else:
                print(f"  FAIL ep{num:02d}: output file missing or too small", file=sys.stderr)
                # Surface ffmpeg's own last message instead of discarding it.
                tail = proc.stderr.decode("utf-8", "replace").strip().splitlines()[-1:]
                if proc.returncode != 0 and tail:
                    print(f"    ffmpeg (exit {proc.returncode}): {tail[0]}", file=sys.stderr)
        except subprocess.TimeoutExpired:
            print(f"  FAIL ep{num:02d}: ffmpeg timeout", file=sys.stderr)
        except Exception as e:
            print(f"  FAIL ep{num:02d}: {e}", file=sys.stderr)

    print(f"\nClipped {clipped}/{len(episodes)} episodes to {out_dir}/", file=sys.stderr)
    return 0


# ── CLI ──────────────────────────────────────────────────────────────────

def build_parser():
    """Create the argparse parser for every mode this script supports.

    Options are declared in a table and registered in order, so the layout of
    ``--help`` follows the table top to bottom.

    Returns:
        argparse.ArgumentParser: parser with all single-scene, batch,
        validate, summary, pattern-extraction, episode-detection, calibration
        and dry-run options registered.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Gemini Video Analysis for Visual Grammar Bible Research",
    )

    # (flag, add_argument keyword arguments) — order here is the --help order.
    option_table = (
        # Single scene options
        ("--url", {"help": "YouTube URL to analyze"}),
        ("--file", {"help": "Local video file to analyze"}),
        ("--id", {"dest": "scene_id", "help": "Unique scene ID (snake_case)"}),
        ("--title", {"help": "Human-readable scene title"}),
        ("--work", {"default": "", "help": "Source work title"}),
        ("--medium", {"choices": MEDIUM_ENUM, "help": "Medium type"}),
        ("--genre", {"default": "", "help": "Comma-separated genre tags"}),
        ("--format", {
            "dest": "format_ratio",
            "default": "16:9",
            "choices": FORMAT_ENUM,
            "help": "Aspect ratio (default: 16:9)",
        }),
        # Batch mode
        ("--batch", {"metavar": "TSV", "help": "Batch analysis from TSV file"}),
        # Validate mode
        ("--validate", {"metavar": "JSON", "help": "Validate existing corpus file"}),
        # Summary mode
        ("--summary", {"action": "store_true", "help": "Aggregate stats across corpus"}),
        # Pattern extraction mode (second pass, text-only)
        ("--patterns", {
            "metavar": "JSON",
            "help": "Second-pass pattern extraction from existing corpus file (text-only)",
        }),
        # Episode detection mode (microdrama compilations)
        ("--detect-episodes", {
            "metavar": "VIDEO",
            "help": "Detect episode boundaries in a microdrama compilation video and clip with ffmpeg",
        }),
        ("--output-dir", {
            "metavar": "DIR",
            "help": "Output directory for clipped episodes (default: {video}_episodes/)",
        }),
        # Calibration mode
        ("--calibrate", {
            "nargs": 2,
            "metavar": ("CORPUS_JSON", "GROUND_TRUTH_TSV"),
            "help": "Compare timing accuracy vs hand-timed data",
        }),
        # Options
        ("--dry-run", {"action": "store_true", "help": "Show what would be done"}),
    )

    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    return cli


def main():
    """Parse command-line arguments and dispatch to the selected mode.

    Modes are checked in a fixed order and the first one present wins:
    --summary, --calibrate, --detect-episodes, --patterns, --validate,
    --batch, then single-scene analysis (--url / --file). If none is given,
    the help text is printed.

    Returns:
        int: process exit status. Summary, calibration, episode-detection,
        pattern-extraction and batch modes return whatever their handler
        returns. For the modes handled here: 0 on success (or when only help
        was printed), 1 when validation reports errors or scene analysis
        raises, 2 when an input file cannot be read or a required argument
        is missing.
    """
    parser = build_parser()
    args = parser.parse_args()

    # ── Summary mode ──
    if args.summary:
        return run_summary()

    # ── Calibration mode ──
    if args.calibrate:
        corpus_json, ground_truth_tsv = args.calibrate
        return run_calibration(corpus_json, ground_truth_tsv)

    # ── Episode detection mode ──
    if args.detect_episodes:
        return run_detect_episodes(args.detect_episodes, args.output_dir)

    # ── Pattern extraction mode ──
    if args.patterns:
        return run_pattern_extraction(args.patterns)

    # ── Validate mode ──
    if args.validate:
        try:
            # Read as UTF-8 explicitly instead of relying on the locale default.
            with open(args.validate, encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError,
            # so an undecodable file is reported cleanly rather than crashing.
            print(f"ERROR: Could not read {args.validate}: {e}", file=sys.stderr)
            return 2

        errors = validate_shot_log(data)
        if errors:
            print(f"VALIDATION ERRORS ({len(errors)}):")
            for problem in errors:
                print(f"  - {problem}")
            return 1

        shots = data.get("shots", [])
        patterns = data.get("patterns_observed", [])
        print(f"VALID: {len(shots)} shots, {len(patterns)} patterns, 0 errors")
        return 0

    # ── Batch mode ──
    if args.batch:
        return run_batch(args.batch, dry_run=args.dry_run)

    # ── Single scene mode ──
    if not args.url and not args.file:
        parser.print_help()
        return 0

    # Validate required args for single scene
    if not args.scene_id:
        print("ERROR: --id required for single scene analysis", file=sys.stderr)
        return 2
    if not args.medium:
        print("ERROR: --medium required for single scene analysis", file=sys.stderr)
        return 2

    title = args.title or args.scene_id
    genre_tags = [g.strip() for g in args.genre.split(",") if g.strip()]

    # When both --url and --file are supplied, the URL takes precedence.
    if args.url:
        video_source = {"type": "url", "url": args.url}
        url = args.url
    else:
        file_path = args.file
        if not Path(file_path).exists():
            print(f"ERROR: File not found: {file_path}", file=sys.stderr)
            return 2
        video_source = {"type": "file", "path": file_path}
        url = None

    # Keyword arguments shared by the dry-run and the real analysis call.
    scene_kwargs = {
        "video_source": video_source,
        "scene_id": args.scene_id,
        "title": title,
        "work": args.work,
        "medium": args.medium,
        "genre_tags": genre_tags,
        "format_ratio": args.format_ratio,
        "url": url,
    }

    if args.dry_run:
        # A dry run is invoked without a client (client=None).
        analyze_scene(client=None, dry_run=True, **scene_kwargs)
        return 0

    client = get_client()

    try:
        analyze_scene(client=client, **scene_kwargs)
    except Exception as e:
        # Top-level boundary: report the failure and map it to exit status 1.
        err_str = str(e)
        print(f"\nERROR: {err_str}", file=sys.stderr)

        # YouTube fallback suggestion
        if args.url and ("not accessible" in err_str or "403" in err_str
                         or "YouTube" in err_str or "video" in err_str.lower()):
            print(
                f"\nYouTube URL not accessible. Download with:\n"
                f"  yt-dlp -o video.mp4 \"{args.url}\"\n"
                f"Then rerun with: --file video.mp4",
                file=sys.stderr,
            )
        return 1

    return 0


if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    raise SystemExit(main())