#!/usr/bin/python3
"""
Dramatic QC Gate - Post-Batch Quality Assessment

This script runs dramatic quality checks on generated episodes.
Unlike validate_behavioral_dna.py (hard gate), this is a SOFT GATE:
- Reports issues but doesn't block generation
- Provides /rewrite commands for fixing
- Can be integrated into checkpoint workflow

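Checks:
1. Voice distinctiveness (swap test approximation)
2. Emotional register variety
3. Declaration timing (earned vs declared)
4. Theme statement detection
5. Cover Test (LLM speaker identification; skipped when no API client is available)
6. On-the-nose dialogue (regex pass, plus an optional LLM pass)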

Usage:
  python3 dramatic_qc_gate.py <project_path> --batch <N>
  python3 dramatic_qc_gate.py <project_path> --ep <N>
  python3 dramatic_qc_gate.py <project_path> --full

Arguments:
  project_path   Path to production project
  --batch N      Check batch N (episodes (N-1)*B+1 to N*B, where B is
                 GENERATION_BATCH_SIZE; i.e. N*5-4 to N*5 when B is 5)
  --ep N         Check single episode N
  --full         Check all episodes

Returns:
- Exit code 0: No MUST FIX issues found
- Exit code 1: MUST FIX issues found (soft fail - reports but doesn't block)
- Exit code 2: Configuration/path error
"""

import random
import sys
import re
import json
from pathlib import Path
from collections import defaultdict

# Import batch size, shared dialogue parsing, and LLM helpers from engine constants.
# The `tools` directory that sits next to this script's directory is prepended
# to sys.path first; that path entry stays in place even if the import fails.
# NOTE: `extract_script_content` is rebound by the `def extract_script_content`
# further down in this module, so neither the imported version nor the fallback
# lambda below is the one the checks end up calling.
try:
    sys.path.insert(0, str(Path(__file__).resolve().parent.parent / 'tools'))
    from engine_constants import (
        GENERATION_BATCH_SIZE,
        ANTHROPIC_SONNET, ANTHROPIC_HAIKU,
        get_anthropic_client,
        call_anthropic,
        parse_llm_field,
        extract_script_content,
        parse_dialogue_blocks as _shared_parse_dialogue_blocks,
    )
    # Tells extract_dialogue_lines to use the shared dialogue parser.
    _USE_SHARED = True
except ImportError:
    # Stand-ins so the regex-based checks still run without engine_constants.
    GENERATION_BATCH_SIZE = 5
    ANTHROPIC_SONNET = "claude-sonnet-4-6"
    ANTHROPIC_HAIKU = "claude-haiku-4-5-20251001"
    # Returning None makes check_cover_test return early and makes
    # check_on_the_nose skip its LLM pass.
    get_anthropic_client = lambda: None
    call_anthropic = lambda client, model, prompt, max_tokens=200: None
    parse_llm_field = lambda result, field, expected=None: None
    extract_script_content = lambda content: content
    # `_shared_parse_dialogue_blocks` is not bound in this branch; it is only
    # referenced when _USE_SHARED is True.
    _USE_SHARED = False

# Early relationship declarations that are red flags
# Thresholds from CONSTANTS.md → Relationship Earning Schedule
# Maps phrase -> first episode number in which the phrase is acceptable:
# check_earned_declarations reports a MUST FIX issue when the phrase appears in
# an episode whose number is lower than the threshold. Keys are lowercase
# because they are compared against lowercased dialogue lines.
EARLY_DECLARATION_PHRASES = {
    'i love you': 51,              # Major declarations: episodes 51-60
    'i trust you completely': 21,  # Tentative trust: episodes 21-30
    'you\'re my family': 41,       # Deep connection: episodes 41-50
    'i need you': 31,              # Significant statements: episodes 31-40
    'you mean everything': 51,     # Major declarations: episodes 51-60
    'i see you completely': 41,    # Deep connection: episodes 41-50
    'you\'re all i have': 41,      # Deep connection: episodes 41-50
}

# Theme statement red flags (characters shouldn't state themes directly)
# Regexes searched anywhere in the lowercased dialogue line by
# check_theme_statements; a hit is reported as MUST FIX.
THEME_STATEMENT_PATTERNS = [
    r'you can never trust',
    r'trust is (?:earned|broken|everything)',
    r'that\'s what (?:love|trust|friendship) (?:really )?means',
    r'this is what (?:power|greed|love) does',
    r'we(?:\'re| are) all just',
    r'in the end,? (?:we|everyone|people)',
    r'the (?:truth|lesson|moral) is',
]

# Generic AI dialogue patterns
# Regexes searched anywhere in the lowercased dialogue line by
# check_voice_distinctiveness; a hit is reported as MUST FIX.
GENERIC_AI_PATTERNS = [
    r'i have calculated',
    r'processing (?:your|the) request',
    r'my analysis indicates',
    r'probability of success',
    r'baseline parameters',
    r'optimal solution',
    r'executing command',
    r'data insufficient',
]

# Generic action dialogue
# Used by check_voice_distinctiveness (reported as COULD IMPROVE). Every
# pattern is anchored with ^, and all but the first also with $, so they only
# match lines that start with -- or consist solely of -- the stock phrase.
GENERIC_ACTION_PATTERNS = [
    r'^we need to (?:move|go|run|leave)',
    r'^let\'s go\.?$',
    r'^come on\.?$',
    r'^watch out\.?$',
    r'^be careful\.?$',
    r'^stay here\.?$',
    r'^get down\.?$',
]


def extract_script_content(content):
    """Return the script portion of an episode file.

    Tries, in order, a fenced ```fountain block and then the text following a
    ``## SCRIPT`` heading (up to the next ``##`` or the end of the text). When
    neither is present the content is returned unchanged.
    """
    section_patterns = (
        r'```fountain\s*(.*?)```',
        r'## SCRIPT\s*(.*?)(?=##|$)',
    )
    for section_pattern in section_patterns:
        found = re.search(section_pattern, content, re.DOTALL)
        if found is not None:
            return found.group(1)
    return content


def extract_dialogue_lines(script):
    """Extract character dialogue from a script as a flat list of lines.

    When the shared ``parse_dialogue_blocks`` parser from engine_constants was
    imported (``_USE_SHARED``), it is used and every (speaker, text) block it
    yields is split into individual non-empty lines.

    Otherwise a local fallback parser is used. The fallback attributes every
    line of a dialogue block to its speaker (previously only the first line
    after the speaker cue was kept) and skips parenthetical direction lines
    such as ``(quietly)``, which used to be captured in place of the actual
    dialogue. How the shared parser treats parentheticals is not visible
    from this file.

    Args:
        script: Script text with ALL-CAPS speaker cues followed by dialogue.

    Returns:
        List of ``{'speaker': str, 'line': str}`` dicts in script order.
    """
    dialogue = []

    if _USE_SHARED:
        # Use shared parser — correctly handles multi-line dialogue blocks
        for speaker, dialogue_text in _shared_parse_dialogue_blocks(script):
            # Split multi-line dialogue into individual lines for downstream consumers
            for line in dialogue_text.split('\n'):
                line = line.strip()
                if line:
                    dialogue.append({
                        'speaker': speaker,
                        'line': line,
                    })
        return dialogue

    # Fallback parser.
    structural_prefixes = ('INT.', 'EXT.', '.', '#')
    # ALL-CAPS tokens that are shot/time markers rather than speaker cues.
    non_speaker_caps = {
        'ECU', 'CU', 'MCU', 'MS', 'WS', 'POV', 'SFX',
        'VFX', 'INSERT', 'CONTINUOUS', 'LATER',
        'NIGHT', 'DAY', 'MORNING', 'EVENING',
    }

    current_speaker = None
    block_has_dialogue = False  # True once the current speaker has a line

    for raw_line in script.split('\n'):
        stripped = raw_line.strip()
        ends_block = not stripped or stripped.startswith(structural_prefixes)

        if not ends_block and stripped.isupper():
            if len(stripped) < 30 and stripped not in non_speaker_caps:
                # Speaker cue: start a fresh dialogue block.
                current_speaker = stripped
                block_has_dialogue = False
                continue
            # Shot/time marker or long caps line: never dialogue.
            ends_block = True

        if ends_block:
            # Close the block only after it has produced dialogue, so a blank
            # line or marker between the cue and its first line is still
            # tolerated exactly as before.
            if block_has_dialogue:
                current_speaker = None
                block_has_dialogue = False
            continue

        if current_speaker is None:
            continue  # action/description outside any dialogue block

        if stripped.startswith('(') and stripped.endswith(')'):
            continue  # parenthetical direction, not spoken text

        dialogue.append({
            'speaker': current_speaker,
            'line': stripped,
        })
        block_has_dialogue = True

    return dialogue


def check_voice_distinctiveness(episodes_data):
    """Flag dialogue lines that match the generic AI / generic action patterns.

    A line matching any GENERIC_AI_PATTERNS entry yields one MUST FIX issue;
    a line matching any GENERIC_ACTION_PATTERNS entry yields one COULD IMPROVE
    issue. A single line can produce both.

    Args:
        episodes_data: Mapping of episode number -> raw episode file content.

    Returns:
        List of issue dicts.
    """
    found = []

    for episode, raw_content in episodes_data.items():
        spoken = extract_dialogue_lines(extract_script_content(raw_content))

        for entry in spoken:
            who = entry['speaker']
            excerpt = entry['line'][:60]
            lowered = entry['line'].lower()

            # Robotic stock phrasing
            if any(re.search(rx, lowered) for rx in GENERIC_AI_PATTERNS):
                found.append({
                    'episode': episode,
                    'type': 'voice',
                    'severity': 'MUST FIX',
                    'speaker': who,
                    'line': excerpt,
                    'detail': 'Generic AI dialogue - could be any AI character',
                    'fix_cmd': f'/rewrite [project] ep {episode} "{who} dialogue generic—add distinctive voice"',
                })

            # Interchangeable action-scene filler
            if any(re.search(rx, lowered) for rx in GENERIC_ACTION_PATTERNS):
                found.append({
                    'episode': episode,
                    'type': 'voice',
                    'severity': 'COULD IMPROVE',
                    'speaker': who,
                    'line': excerpt,
                    'detail': 'Generic action line - could be any character',
                    'fix_cmd': f'/rewrite [project] ep {episode} "Generic dialogue—rewrite with character idiom"',
                })

    return found


def check_emotional_register(episodes_data):
    """Flag a batch whose episodes all sit at a similar estimated intensity.

    Intensity (1-10) is estimated per episode from how many distinct high- and
    low-intensity marker words occur in the lowercased script. With at least
    three episodes and a max-min spread of 2 or less, one batch-level
    COULD IMPROVE issue is returned.

    Args:
        episodes_data: Mapping of episode number -> raw episode file content.

    Returns:
        List containing zero or one issue dict.
    """
    # Marker vocabularies (substring presence, counted once per marker)
    loud_words = ('fires', 'shoots', 'explosion', 'attack', 'chase',
                  'running', 'fight', 'scream', 'blood', 'death')
    quiet_words = ('quietly', 'softly', 'silence', 'pause', 'beat',
                   'gentle', 'whisper', 'moment', 'peace')

    intensities = {}
    for episode, raw_content in episodes_data.items():
        text = extract_script_content(raw_content).lower()

        loud = sum(word in text for word in loud_words)
        quiet = sum(word in text for word in quiet_words)
        lean = loud - quiet

        if lean > 2:
            estimate = 6 + loud       # clearly action-heavy
        elif lean < -2:
            estimate = 5 - quiet      # clearly subdued
        else:
            estimate = 5 + lean       # roughly balanced

        # Clamp onto the 1-10 scale
        intensities[episode] = max(1, min(10, estimate))

    if len(intensities) < 3:
        return []

    levels = list(intensities.values())
    range_val = max(levels) - min(levels)
    if range_val > 2:
        return []

    return [{
        'episode': 'batch',
        'type': 'texture',
        'severity': 'COULD IMPROVE',
        'detail': f'Monotone emotional register across batch (range: {range_val})',
        'intensities': intensities,
        'suggestion': 'Vary one episode significantly up or down',
    }]


def check_earned_declarations(episodes_data):
    """Flag relationship declarations that appear before they are earned.

    Each EARLY_DECLARATION_PHRASES entry maps a phrase to the first episode in
    which it is acceptable; the phrase is reported as MUST FIX when it occurs
    in an earlier episode.

    Phrases are matched on word boundaries, so "I need your help" is no longer
    reported as the declaration "i need you". Typographic apostrophes are
    normalised to ``'`` so "you’re my family" is recognised.

    Args:
        episodes_data: Mapping of episode number -> raw episode file content.

    Returns:
        List of issue dicts, one per (line, phrase) hit.
    """
    issues = []

    # Compile once; \b keeps a phrase from matching inside a longer word.
    phrase_rules = [
        (re.compile(r'\b' + re.escape(phrase) + r'\b'), min_ep)
        for phrase, min_ep in EARLY_DECLARATION_PHRASES.items()
    ]

    for ep_num, content in episodes_data.items():
        script = extract_script_content(content)
        dialogue = extract_dialogue_lines(script)

        for item in dialogue:
            # Normalise the curly apostrophe so contractions match the table.
            line_lower = item['line'].lower().replace('\u2019', "'")

            for matcher, min_ep in phrase_rules:
                if ep_num < min_ep and matcher.search(line_lower):
                    issues.append({
                        'episode': ep_num,
                        'type': 'relationship_earning',
                        'severity': 'MUST FIX',
                        'speaker': item['speaker'],
                        'line': item['line'][:60],
                        # The threshold episode itself is allowed, hence "from".
                        'detail': f'Declaration too early (found in Ep {ep_num}, appropriate from Ep {min_ep})',
                        'fix_cmd': f'/rewrite [project] ep {ep_num} "Unearned declaration—soften or move later"',
                    })

    return issues


def check_theme_statements(episodes_data):
    """Flag dialogue lines in which a character announces the theme outright.

    A line matching any THEME_STATEMENT_PATTERNS entry yields exactly one
    MUST FIX issue.

    Args:
        episodes_data: Mapping of episode number -> raw episode file content.

    Returns:
        List of issue dicts.
    """
    flagged = []

    for episode, raw_content in episodes_data.items():
        spoken = extract_dialogue_lines(extract_script_content(raw_content))

        for entry in spoken:
            lowered = entry['line'].lower()
            if not any(re.search(rx, lowered) for rx in THEME_STATEMENT_PATTERNS):
                continue

            flagged.append({
                'episode': episode,
                'type': 'texture',
                'severity': 'MUST FIX',
                'speaker': entry['speaker'],
                'line': entry['line'][:60],
                'detail': 'Theme stated directly in dialogue—should be embodied, not announced',
                'fix_cmd': f'/rewrite [project] ep {episode} "Theme stated in dialogue—embody instead"',
            })

    return flagged


# ---------------------------------------------------------------------------
# LLM-based Cover Test (Scaffolding Gate G3 — Seger)
# Strips dialogue tags, asks Sonnet to identify speakers.
# If accuracy < 80%, dialogue voices have converged.
# ---------------------------------------------------------------------------



def _load_character_names(project_path):
    """Load character names from characters.md for the Cover Test.

    Looks for ``bible/characters.md`` first, then ``characters.md`` in the
    project root. Every level-2 markdown heading (``## Name``) that starts with
    a capital letter is taken as a character name, except a few known section
    titles.

    The heading text is matched on a single line only. The previous pattern
    used ``\\s`` inside the character class, which also matches newlines, so a
    heading swallowed the body text that followed it and produced names that
    never matched any speaker.

    Args:
        project_path: ``pathlib.Path`` of the project root.

    Returns:
        List of upper-cased character names in file order; empty when no
        characters file exists.
    """
    candidates = (
        project_path / "bible" / "characters.md",
        project_path / "characters.md",
    )
    chars_path = next((path for path in candidates if path.exists()), None)
    if chars_path is None:
        return []

    # Decode explicitly so the result does not depend on the platform locale.
    content = chars_path.read_text(encoding="utf-8")

    section_titles = ('voice', 'speech', 'dialogue', 'overview', 'behavioral')
    names = []
    # [ \t] instead of \s keeps the match from running past the end of the line.
    for match in re.finditer(r'^##[ \t]+([A-Z][A-Za-z0-9 \t\-]+)', content, re.MULTILINE):
        name = match.group(1).strip()
        if name.lower() not in section_titles:
            names.append(name.upper())
    return names


def check_cover_test(episodes_data, project_path):
    """
    Cover Test (G3 — Seger): Strip dialogue tags, ask LLM to identify speakers.

    Up to 20 randomly sampled dialogue lines from known characters are sent to
    the model without speaker tags. Below 80% accuracy a MUST FIX issue is
    reported; below 90% a COULD IMPROVE issue.

    The test is skipped (empty list) when no API client is available, no
    character names can be loaded, or fewer than 5 usable dialogue lines
    exist. A failed API call or a reply with no gradable answers also skips
    the test, but now prints a warning to stderr instead of passing silently.
    Each numbered answer is graded at most once.

    Args:
        episodes_data: Mapping of episode number -> raw episode file content.
        project_path: ``pathlib.Path`` of the project root.

    Returns:
        List containing zero or one batch-level issue dict.
    """
    issues = []
    client = get_anthropic_client()
    if client is None:
        return issues  # Skip silently without API

    character_names = _load_character_names(project_path)
    if not character_names:
        return issues

    known_names = set(character_names)

    # Collect all dialogue across the batch
    all_dialogue = []
    for ep_num, content in episodes_data.items():
        script = extract_script_content(content)
        for item in extract_dialogue_lines(script):
            if item['speaker'] in known_names:
                all_dialogue.append({
                    'ep': ep_num,
                    'speaker': item['speaker'],
                    'line': item['line'],
                })

    if len(all_dialogue) < 5:
        return issues  # Not enough dialogue to test

    # Sample up to 20 lines for the test (cost control)
    sample = random.sample(all_dialogue, min(20, len(all_dialogue)))

    # Build the stripped-tags prompt
    numbered_lines = []
    answer_key = {}
    for i, item in enumerate(sample, 1):
        numbered_lines.append(f"{i}. \"{item['line']}\"")
        answer_key[i] = item['speaker']

    chars_list = ', '.join(sorted(known_names))

    prompt = f"""You are taking a dialogue identification test. Below are lines of dialogue from a microdrama series with the speaker tags REMOVED. Based only on word choice, rhythm, and perspective, identify which character said each line.

Characters in this series: {chars_list}

Dialogue lines (speaker unknown):
{chr(10).join(numbered_lines)}

For each numbered line, output the character name you think said it.
Output EXACTLY in this format (one per line):
1: [CHARACTER NAME]
2: [CHARACTER NAME]
..."""

    try:
        resp = client.messages.create(
            model=ANTHROPIC_SONNET, max_tokens=500,
            messages=[{"role": "user", "content": prompt}],
        )
        result = resp.content[0].text.strip()
    except Exception as exc:
        # Best-effort check: never block the gate, but say why it was skipped.
        print(f"Warning: Cover Test skipped (LLM call failed: {exc})", file=sys.stderr)
        return issues

    # Grade the reply
    correct = 0
    misidentified = []
    graded = set()  # answer numbers already scored

    for line in result.split('\n'):
        match = re.match(r'(\d+)\s*:\s*(.+)', line.strip())
        if not match:
            continue
        line_num = int(match.group(1))
        if line_num not in answer_key or line_num in graded:
            continue  # unknown number, or a repeat that would inflate the total
        graded.add(line_num)

        guessed = match.group(2).strip().upper()
        actual = answer_key[line_num]
        # Containment tolerates brackets or partial names in the reply.
        if guessed == actual or guessed in actual or actual in guessed:
            correct += 1
        else:
            misidentified.append({
                'line_num': line_num,
                'actual': actual,
                'guessed': guessed,
                'text': sample[line_num - 1]['line'][:50],
                'ep': sample[line_num - 1]['ep'],
            })

    total = len(graded)
    if total == 0:
        print("Warning: Cover Test skipped (no gradable answers in LLM reply)", file=sys.stderr)
        return issues

    accuracy = correct / total * 100

    if accuracy < 80:
        mistakes = ", ".join(
            m["actual"] + " mistaken for " + m["guessed"] for m in misidentified[:3]
        )
        issues.append({
            'episode': 'batch',
            'type': 'voice',
            'severity': 'MUST FIX',
            'detail': f'COVER TEST FAILED: {accuracy:.0f}% speaker accuracy ({correct}/{total}). Voices have converged.',
            'suggestion': f'Misidentified: {mistakes}',
            'fix_cmd': '/script-doctor [project] --focus voice',
            'cover_test_accuracy': accuracy,
        })
    elif accuracy < 90:
        issues.append({
            'episode': 'batch',
            'type': 'voice',
            'severity': 'COULD IMPROVE',
            'detail': f'Cover Test: {accuracy:.0f}% speaker accuracy ({correct}/{total}). Some voice convergence.',
            'cover_test_accuracy': accuracy,
        })

    return issues


def check_on_the_nose(episodes_data):
    """
    On-the-Nose Diagnostic (R4 — Seger): Detect characters directly
    stating emotions or themes.

    First pass: regex patterns for stated emotions; each matching dialogue
    line yields one MUST FIX issue. Second pass (only when an API client is
    available and the regex pass found fewer than 5 issues): the first 40
    dialogue lines are sent to Haiku, and each line it flags that can be
    located in an episode script yields one COULD IMPROVE issue.

    Changes in the LLM pass compared with the earlier version:
    - The reply is no longer thrown away whenever the word "none" appears
      anywhere in it; only ``LINE:`` entries are read.
    - An echoed ``EpN SPEAKER:`` prefix is stripped before the line is
      searched for in the scripts.
    - Empty flagged lines are ignored (an empty string matches every script).
    - A failed API call prints a warning to stderr instead of being hidden.

    Args:
        episodes_data: Mapping of episode number -> raw episode file content.

    Returns:
        List of issue dicts.
    """
    issues = []
    client = get_anthropic_client()

    # First pass: regex (existing theme patterns + new emotion patterns)
    emotion_patterns = [
        r'i(?:\'m| am) (?:so |really |truly )?(?:angry|scared|afraid|sad|happy|hurt|betrayed|lonely|broken)',
        r'(?:you|he|she|they) make[s]? me (?:feel |so )?(?:angry|scared|afraid|sad|happy|hurt)',
        r'i feel (?:so |really |truly )?(?:lost|alone|empty|broken|free|alive|dead inside)',
        r'this (?:is|feels) (?:just )?like (?:losing|finding|being)',
    ]

    # Parse every episode once; both passes reuse the result.
    episodes = []
    for ep_num, content in episodes_data.items():
        script = extract_script_content(content)
        episodes.append((ep_num, script.lower(), extract_dialogue_lines(script)))

    for ep_num, _script_lower, dialogue in episodes:
        for item in dialogue:
            line_lower = item['line'].lower()

            # One issue per line, however many patterns match
            if any(re.search(pattern, line_lower) for pattern in emotion_patterns):
                issues.append({
                    'episode': ep_num,
                    'type': 'on_the_nose',
                    'severity': 'MUST FIX',
                    'speaker': item['speaker'],
                    'line': item['line'][:60],
                    'detail': 'Character directly states emotional state — rewrite into action or subtext',
                    'fix_cmd': f'/rewrite [project] ep {ep_num} "On-the-nose dialogue—translate to action/subtext"',
                })

    # Second pass: LLM for deeper on-the-nose detection (batch level).
    # Only run it if regex didn't already catch many issues.
    if not client or len(issues) >= 5:
        return issues

    all_dialogue_text = [
        f"Ep{ep_num} {item['speaker']}: {item['line']}"
        for ep_num, _script_lower, dialogue in episodes
        for item in dialogue
    ]
    if len(all_dialogue_text) < 3:
        return issues

    dialogue_block = '\n'.join(all_dialogue_text[:40])  # Cap at 40 lines

    prompt = f"""Review this dialogue for on-the-nose writing. Flag lines where characters directly STATE their emotions, announce the theme, or explain what they're feeling instead of showing it through action or subtext.

{dialogue_block}

List ONLY the on-the-nose lines (max 5). For each, output:
LINE: [the exact dialogue]
PROBLEM: [why it's on-the-nose, in 5 words]

If no on-the-nose lines found, output: NONE"""

    try:
        resp = client.messages.create(
            model=ANTHROPIC_HAIKU, max_tokens=400,
            messages=[{"role": "user", "content": prompt}],
        )
        result = resp.content[0].text.strip()
    except Exception as exc:
        # Best-effort pass: keep the regex findings, but report the failure.
        print(f"Warning: on-the-nose LLM pass skipped (LLM call failed: {exc})", file=sys.stderr)
        return issues

    # A bare "NONE" reply simply contains no LINE: entries.
    for match in re.finditer(r'LINE:\s*(.+?)(?:\n|$)', result):
        flagged_line = match.group(1).strip().strip('"\'')
        # The prompt labels each line "EpN SPEAKER: ..."; drop that label if
        # the model echoed it, otherwise the text is never found in a script.
        flagged_line = re.sub(r'^Ep\s*\d+\s+[^:\n]{1,40}:\s*', '', flagged_line)
        flagged_line = flagged_line.strip().strip('"\'')

        needle = flagged_line[:30].lower()
        if not needle or flagged_line.upper() == 'NONE':
            continue

        # Find which episode this belongs to (first script containing it)
        for ep_num, script_lower, _dialogue in episodes:
            if needle in script_lower:
                issues.append({
                    'episode': ep_num,
                    'type': 'on_the_nose',
                    'severity': 'COULD IMPROVE',
                    'line': flagged_line[:60],
                    'detail': 'LLM detected on-the-nose dialogue — consider rewriting into subtext',
                    'fix_cmd': f'/rewrite [project] ep {ep_num} "On-the-nose—translate to subtext"',
                })
                break

    return issues


def load_episodes(project_path, batch_num=None, ep_num=None, full=False):
    """Load episode files from ``<project_path>/episodes``.

    Exactly one mode is used, chosen in this order: ``batch_num`` (episodes
    (N-1)*GENERATION_BATCH_SIZE+1 .. N*GENERATION_BATCH_SIZE), ``ep_num`` (a
    single episode), ``full`` (every episode number found in ``ep_*.md``
    file names). Episodes are read from ``ep_NNN.md`` (number zero-padded to
    three digits); numbers whose file does not exist are skipped.

    In ``full`` mode the episode numbers are de-duplicated and sorted
    numerically, so ep_101 comes before ep_1000 (file-name order put it
    after). Files are decoded as UTF-8 regardless of the platform locale.

    Args:
        project_path: ``pathlib.Path`` of the project root.
        batch_num: 1-based batch number, or None.
        ep_num: Episode number, or None.
        full: Load every episode when True.

    Returns:
        Dict mapping episode number -> file content; empty when the episodes
        directory is missing, no mode is selected, or nothing matches.
    """
    episodes_dir = project_path / "episodes"
    episodes_data = {}

    if not episodes_dir.exists():
        return episodes_data

    if batch_num:
        ep_start = (batch_num - 1) * GENERATION_BATCH_SIZE + 1
        ep_end = batch_num * GENERATION_BATCH_SIZE
        ep_range = range(ep_start, ep_end + 1)
    elif ep_num:
        ep_range = [ep_num]
    elif full:
        # Discover episode numbers from file names; a set drops repeats such
        # as ep_001.md / ep_001_v2.md that map to the same number.
        found_numbers = set()
        for ep_path in episodes_dir.glob("ep_*.md"):
            match = re.search(r'ep_(\d+)', ep_path.name)
            if match:
                found_numbers.add(int(match.group(1)))
        ep_range = sorted(found_numbers)
    else:
        return episodes_data

    for ep in ep_range:
        ep_file = episodes_dir / f"ep_{ep:03d}.md"
        if ep_file.exists():
            episodes_data[ep] = ep_file.read_text(encoding="utf-8")

    return episodes_data


def main():
    """Command-line entry point: run all checks and print the report.

    Reads ``sys.argv``: ``<project_path>`` followed by ``--batch N``,
    ``--ep N`` or ``--full``. Unrecognised arguments are ignored.

    Exit codes:
        0 -- no MUST FIX issues
        1 -- MUST FIX issues found (soft fail)
        2 -- usage/path error, non-integer ``--batch``/``--ep`` value, or no
             episodes found
    """
    if len(sys.argv) < 3:
        print("Usage: python3 dramatic_qc_gate.py <project_path> --batch <N>")
        print("       python3 dramatic_qc_gate.py <project_path> --ep <N>")
        print("       python3 dramatic_qc_gate.py <project_path> --full")
        sys.exit(2)

    project_path = Path(sys.argv[1]).resolve()

    if not project_path.exists():
        print(f"Error: Project path does not exist: {project_path}")
        sys.exit(2)

    # Parse mode
    batch_num = None
    ep_num = None
    full = False

    args = sys.argv[2:]
    i = 0
    while i < len(args):
        flag = args[i]
        if flag in ('--batch', '--ep') and i + 1 < len(args):
            try:
                value = int(args[i + 1])
            except ValueError:
                # Report bad input as a configuration error, not a traceback.
                print(f"Error: {flag} expects an integer, got: {args[i + 1]!r}")
                sys.exit(2)
            if flag == '--batch':
                batch_num = value
            else:
                ep_num = value
            i += 2
        elif flag == '--full':
            full = True
            i += 1
        else:
            i += 1

    # Load episodes
    episodes_data = load_episodes(project_path, batch_num, ep_num, full)

    if not episodes_data:
        print("Error: No episodes found")
        sys.exit(2)

    # Run checks
    all_issues = []
    all_issues.extend(check_voice_distinctiveness(episodes_data))
    all_issues.extend(check_cover_test(episodes_data, project_path))
    all_issues.extend(check_emotional_register(episodes_data))
    all_issues.extend(check_earned_declarations(episodes_data))
    all_issues.extend(check_theme_statements(episodes_data))
    all_issues.extend(check_on_the_nose(episodes_data))

    # Categorize
    must_fix = [issue for issue in all_issues if issue['severity'] == 'MUST FIX']
    could_improve = [issue for issue in all_issues if issue['severity'] == 'COULD IMPROVE']

    # Report
    if batch_num:
        scope = f"Batch {batch_num} (Ep {(batch_num-1)*GENERATION_BATCH_SIZE+1}-{batch_num*GENERATION_BATCH_SIZE})"
    elif ep_num:
        scope = f"Episode {ep_num}"
    else:
        scope = f"Full Series ({len(episodes_data)} episodes)"

    print(f"\n{'='*60}")
    print(f"DRAMATIC QC: {scope}")
    print(f"Project: {project_path.name}")
    print(f"{'='*60}")

    print(f"\nMUST FIX: {len(must_fix)} issues")
    print(f"COULD IMPROVE: {len(could_improve)} issues")

    _print_issue_section("MUST FIX", must_fix, "Problem: ")
    _print_issue_section("COULD IMPROVE", could_improve, "")

    # Result
    print(f"\n{'='*60}")

    if must_fix:
        print(f"DRAMATIC QC: {len(must_fix)} MUST FIX issues found")
        print("\nRun the /rewrite commands above to address issues.")
        print(f"{'='*60}\n")
        sys.exit(1)  # Soft fail - reports but doesn't block
    else:
        print("DRAMATIC QC: PASSED (no critical issues)")
        if could_improve:
            print(f"Consider addressing the {len(could_improve)} COULD IMPROVE items.")
        print(f"{'='*60}\n")
        sys.exit(0)


def _print_issue_section(title, issues, detail_prefix):
    """Print one severity section of the report; prints nothing when empty.

    ``detail_prefix`` is placed in front of each issue's detail text. The
    optional ``suggestion`` is printed for every severity, so the Cover Test's
    misidentification summary on MUST FIX issues is no longer dropped.
    """
    if not issues:
        return

    print(f"\n{'-'*60}")
    print(title)
    print(f"{'-'*60}")
    for issue in issues:
        print(f"\n[{issue['type'].upper()}] Ep {issue['episode']}: {issue.get('speaker', '')}")
        if 'line' in issue:
            print(f"  Line: \"{issue['line']}\"")
        print(f"  {detail_prefix}{issue['detail']}")
        if 'suggestion' in issue:
            print(f"  Suggestion: {issue['suggestion']}")
        if 'fix_cmd' in issue:
            print(f"  {issue['fix_cmd']}")


if __name__ == "__main__":
    main()
