#!/usr/bin/env python3
"""
validate_docs.py - Documentation Consistency Checker

Ensures all documentation files are consistent with CONSTANTS.md.
Run this after any change to CONSTANTS.md or when editing documentation.

Usage:
    python3 validate_docs.py                    # Check all docs
    python3 validate_docs.py --fix-refs         # Also print how to fix files missing a CONSTANTS.md reference
    python3 validate_docs.py --verbose          # Show all checks, not just failures
"""

import os
import re
import sys
from pathlib import Path
from collections import defaultdict

# Constants we check for (from CONSTANTS.md)
#
# Maps a constant name to a spec dict with three keys:
#   'pattern'  - regex searched in every document by check_value_consistency()
#                (always with re.IGNORECASE, so the upper/lower-case
#                alternations below are redundant but harmless)
#   'value'    - human-readable label printed in the --verbose report
#   'category' - key used to group appearances in that report
#
# NOTE(review): several patterns are unanchored and can over-match, e.g.
# 'dialogue_max' matches the "40%" inside "140%" and 'max_exchanges'
# matches "max 8" inside "max 80". The results are informational only.
CANONICAL_VALUES = {
    # Word counts
    'word_count': {'pattern': r'\b450-500\b', 'value': '450-500', 'category': 'word_count'},

    # Percentages - ratios
    'dialogue_max': {'pattern': r'(?:≤|<=?)?\s*40\s*%', 'value': '40%', 'category': 'percentage'},
    'hook_silent': {'pattern': r'80\s*%\s*(?:silent|SILENT)', 'value': '80% silent', 'category': 'pattern_ratio'},
    'hook_dialogue': {'pattern': r'20\s*%\s*(?:dialogue|DIALOGUE)', 'value': '20% dialogue', 'category': 'pattern_ratio'},
    'cliffhanger_midaction': {'pattern': r'80\s*%\s*(?:mid-?action|MID-?ACTION)', 'value': '80% mid-action', 'category': 'pattern_ratio'},
    'cliffhanger_aftermath': {'pattern': r'20\s*%\s*(?:aftermath|AFTERMATH)', 'value': '20% aftermath', 'category': 'pattern_ratio'},

    # Treatment ratios (70-85% range)
    'treatment_hook_ratio': {'pattern': r'70-85\s*%\s*(?:silent|SILENT)', 'value': '70-85% SILENT', 'category': 'treatment_ratio'},
    'treatment_cliff_ratio': {'pattern': r'70-85\s*%\s*(?:mid-?action|MID-?ACTION)', 'value': '70-85% MID-ACTION', 'category': 'treatment_ratio'},

    # Limits
    'max_exchanges': {'pattern': r'(?:max(?:imum)?|≤)\s*8\s*(?:exchanges)?', 'value': '8 exchanges', 'category': 'limit'},
    'max_consecutive': {'pattern': r'max\s*3\s*(?:consecutive|allowed)', 'value': 'max 3 consecutive', 'category': 'limit'},
    'consecutive_violation': {'pattern': r'4\+\s*(?:is\s*)?(?:violation|triggers?)', 'value': '4+ is violation', 'category': 'limit'},

    # Episode counts
    'total_episodes': {'pattern': r'\b60\s*(?:episodes?|eps?)\b', 'value': '60 episodes', 'category': 'structure'},
    'batch_size': {'pattern': r'\b5\s*(?:episodes?\s*per\s*batch|eps?\s*per)', 'value': '5 per batch', 'category': 'structure'},
    'pilot_count': {'pattern': r'(?:first|pilot)\s*10\s*(?:episodes?|eps?)', 'value': 'first 10', 'category': 'structure'},

    # Structural beats
    # The '.*' in these patterns does not cross newlines (no re.DOTALL), so
    # the episode number and the beat keyword must share a line.
    'plot_point_1': {'pattern': r'(?:ep(?:isode)?\.?\s*)?15.*(?:plot\s*point|threshold|lock-?in)', 'value': 'Ep 15', 'category': 'beat'},
    'midpoint': {'pattern': r'(?:ep(?:isode)?\.?\s*)?30.*midpoint', 'value': 'Ep 30', 'category': 'beat'},
    'all_is_lost': {'pattern': r'(?:ep(?:isode)?\.?\s*)?45.*(?:all\s*is\s*lost|defeat)', 'value': 'Ep 45', 'category': 'beat'},
}

# Known outdated values that should be flagged
#
# Each entry is a dict with three keys:
#   'pattern'     - regex searched case-insensitively by check_outdated_values()
#   'replacement' - text printed after "Should be:" in the report
#   'description' - short reason printed in parentheses under the finding
#
# Any match counts as an issue and makes the script exit with status 1.
OUTDATED_VALUES = [
    {'pattern': r'\b250-315\b', 'replacement': '450-500 (or see CONSTANTS.md)', 'description': 'old word count'},
    {'pattern': r'\b280-350\b', 'replacement': '450-500 (see CONSTANTS.md)', 'description': 'old validation range'},
    {'pattern': r'\b265\s*words?\b', 'replacement': '450-500 (see CONSTANTS.md)', 'description': 'old word target'},
    {'pattern': r'\b350-400\b', 'replacement': '450-500 (see CONSTANTS.md)', 'description': 'old word count (v12)'},
    {'pattern': r'\b330-420\b', 'replacement': '450-500 (see CONSTANTS.md)', 'description': 'old validation range (v12)'},
    # "The Lens" is accepted only when directly followed by "(".
    # NOTE(review): unlike check_terminology(), this rule has no path or
    # context filter, so it also fires inside lenses/ files - confirm intended.
    {'pattern': r'The\s+Lens\b(?!\s*\()', 'replacement': 'Archetype-Worldview', 'description': 'renamed concept'},
    {'pattern': r'(?:no|never)\s*3\+\s*(?:same\s*type\s*)?consecutive', 'replacement': 'max 3 consecutive; 4+ is violation', 'description': 'wrong consecutive rule (should be max 3 allowed, not no 3+)'},
    {'pattern': r'\b5000-6000\b', 'replacement': '3000-4000 (see CONSTANTS.md)', 'description': 'wrong treatment total word count'},
]

# Files that should reference CONSTANTS.md
#
# check_constants_reference() treats each entry as a plain substring to look
# for in a file's path, so an entry may be a bare file name or a
# "directory/file" fragment. Use forward slashes in entries.
SHOULD_REFERENCE_CONSTANTS = [
    'format_v12/SKILL.md',
    'treatment/SKILL.md',
    'relationship_earning/SKILL.md',
    'dramatic_elements/SKILL.md',
    'orchestration_process/SKILL.md',
    'appendix_a_cliffhangers_hooks.md',
    'appendix_b_variety.md',
    'appendix_c_emotion.md',
    'appendix_d_ai_video.md',
    'SCRIPTING_REQUIREMENTS.md',
]


def find_engine_path():
    """Locate the engine root directory (recoil/).

    Two strategies are tried in order:

    1. If this script lives in a directory named ``tools``, the parent of
       that directory is the engine root.
    2. Otherwise the current working directory is accepted when it contains
       both a ``tools`` and an ``editors`` subdirectory.

    Returns:
        Path: The engine root directory.

    Raises:
        FileNotFoundError: If neither strategy identifies the engine root.
    """
    # Resolve before taking the parent: when ``__file__`` is a bare relative
    # name (e.g. ``validate_docs.py``) its parent is ``.``, whose ``name`` is
    # empty, so the 'tools' check below would never succeed.
    script_dir = Path(__file__).resolve().parent
    if script_dir.name == 'tools':
        return script_dir.parent

    # Fall back to the current working directory.
    cwd = Path.cwd()
    if (cwd / 'tools').is_dir() and (cwd / 'editors').is_dir():
        return cwd
    raise FileNotFoundError("Could not find engine root directory")


def get_all_md_files(engine_path):
    """Collect every markdown file under the engine directory.

    Directories whose name contains ``archive`` are skipped together with
    everything below them. Only directories *below* ``engine_path`` are
    examined, so a checkout that itself lives under a directory such as
    ``~/archive/recoil`` is still scanned.

    Args:
        engine_path: Root directory to walk (``str`` or ``Path``).

    Returns:
        list[Path]: Paths of all files ending in ``.md``, in ``os.walk``
        order.
    """
    md_files = []
    for root, dirs, files in os.walk(engine_path):
        # Prune in place so os.walk never descends into archived trees.
        dirs[:] = [d for d in dirs if 'archive' not in d]
        md_files.extend(
            Path(root) / name for name in files if name.endswith('.md')
        )
    return md_files


def check_constants_reference(file_path, content):
    """Report whether a file should, and does, reference CONSTANTS.md.

    Args:
        file_path: Path of the file being checked (``str`` or ``Path``).
        content: Full text of the file.

    Returns:
        tuple[bool, bool]: ``(should_reference, has_reference)``.
        ``should_reference`` is True when any entry of
        ``SHOULD_REFERENCE_CONSTANTS`` occurs in the path;
        ``has_reference`` is True when the text contains ``CONSTANTS.md``.
    """
    # Entries are matched as substrings, so the complete path can be used
    # directly. (Splitting the path on an empty separator is not an option:
    # ``str.split('')`` raises ``ValueError: empty separator``.)
    # Backslashes are normalized so the "dir/file" entries also match
    # Windows-style paths.
    normalized_path = str(file_path).replace('\\', '/')

    should_reference = any(
        ref in normalized_path for ref in SHOULD_REFERENCE_CONSTANTS
    )
    has_reference = 'CONSTANTS.md' in content

    return should_reference, has_reference


def check_outdated_values(file_path, content):
    """Find occurrences of known-outdated values in a document.

    Every pattern in ``OUTDATED_VALUES`` is searched case-insensitively.

    Args:
        file_path: Path of the document; stored unchanged in each record.
        content: Full text of the document.

    Returns:
        list[dict]: One record per match, ordered by rule and then by
        position, with the keys ``file``, ``line`` (1-based), ``found``
        (the matched text), ``replacement`` and ``description``.
    """
    return [
        {
            'file': file_path,
            # 1-based line number: newlines before the match start, plus one.
            'line': content.count('\n', 0, hit.start()) + 1,
            'found': hit.group(),
            'replacement': rule['replacement'],
            'description': rule['description'],
        }
        for rule in OUTDATED_VALUES
        for hit in re.finditer(rule['pattern'], content, re.IGNORECASE)
    ]


def check_value_consistency(file_path, content):
    """Record where the canonical values appear in a document.

    Purely informational: nothing here is treated as an error. Every
    pattern in ``CANONICAL_VALUES`` is searched case-insensitively.

    Args:
        file_path: Path of the document; stored unchanged in each record.
        content: Full text of the document.

    Returns:
        defaultdict[str, list[dict]]: Records grouped by the constant's
        category. Each record has the keys ``constant``, ``file``,
        ``line`` (1-based) and ``value``. Categories without a match are
        absent.
    """
    by_category = defaultdict(list)

    for const_name, spec in CANONICAL_VALUES.items():
        regex = re.compile(spec['pattern'], re.IGNORECASE)
        for hit in regex.finditer(content):
            record = {
                'constant': const_name,
                'file': file_path,
                # Newlines before the match start give the 0-based line.
                'line': content.count('\n', 0, hit.start()) + 1,
                'value': spec['value'],
            }
            by_category[spec['category']].append(record)

    return by_category


def check_terminology(file_path, content):
    """Flag uses of "The Lens" that should read "Archetype-Worldview".

    Files whose path contains ``lenses/`` or ``archive/`` are exempt. In
    all other files a case-insensitive match of "The Lens" (not followed
    by "(assessment") is reported only when one of the competence-related
    keywords occurs within 50 characters on either side of the match.

    Args:
        file_path: Path of the document (``str`` or ``Path``); stored
            unchanged in each record.
        content: Full text of the document.

    Returns:
        list[dict]: One record per finding with the keys ``file``,
        ``line`` (1-based), ``issue`` and ``context`` (the stripped text
        window around the match).
    """
    issues = []

    # Normalize separators so the directory exemptions also apply to
    # backslash-separated (Windows-style) paths.
    normalized_path = str(file_path).replace('\\', '/')
    if 'lenses/' in normalized_path or 'archive/' in normalized_path:
        return issues

    # Keywords indicating the match refers to the super-competence concept.
    # 'competence' also covers 'super-competence'.
    concept_keywords = (
        'technopath', 'tactician', 'high-roller', 'survivor', 'competence',
    )

    for match in re.finditer(r'The\s+Lens\b(?!\s*\(assessment)', content, re.IGNORECASE):
        # Inspect a window of up to 50 characters on each side of the match.
        context_start = max(0, match.start() - 50)
        context_end = min(len(content), match.end() + 50)
        context = content[context_start:context_end]
        lowered = context.lower()

        if any(keyword in lowered for keyword in concept_keywords):
            issues.append({
                'file': file_path,
                'line': content.count('\n', 0, match.start()) + 1,
                'issue': '"The Lens" should be "Archetype-Worldview"',
                'context': context.strip(),
            })

    return issues


def _display_path(path, engine_path):
    """Return ``path`` relative to ``engine_path`` for report output.

    Falls back to the unmodified path string when ``path`` is not located
    under ``engine_path``.
    """
    try:
        return Path(path).relative_to(engine_path).as_posix()
    except ValueError:
        return str(path)


def _print_section(title):
    """Print a report section header: blank line, rule, title, rule."""
    print("\n" + "=" * 70)
    print(title)
    print("=" * 70)


def main():
    """Run all documentation checks and print a report.

    Command-line flags (read from ``sys.argv``):
        --verbose   Also print where the canonical values appear.
        --fix-refs  Also print the line to add to files that lack a
                    CONSTANTS.md reference.

    Exits with status 1 when the engine root cannot be found or when
    outdated values or terminology issues were found, otherwise with
    status 0. Missing CONSTANTS.md references are reported but do not
    affect the exit status.
    """
    verbose = '--verbose' in sys.argv
    fix_refs = '--fix-refs' in sys.argv

    try:
        engine_path = find_engine_path()
    except FileNotFoundError as e:
        print(f"Error: {e}")
        sys.exit(1)

    print("=" * 70)
    print("DOCUMENTATION CONSISTENCY CHECK")
    print(f"Engine path: {engine_path}")
    print("=" * 70)

    md_files = get_all_md_files(engine_path)
    print(f"\nScanning {len(md_files)} markdown files...\n")

    all_outdated = []
    all_terminology = []
    missing_references = []
    value_map = defaultdict(list)

    for file_path in md_files:
        # Unreadable or non-UTF-8 files are skipped with a warning so one
        # bad file does not abort the whole scan.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, UnicodeError) as e:
            print(f"Warning: Could not read {file_path}: {e}")
            continue

        # Check CONSTANTS.md reference
        should_ref, has_ref = check_constants_reference(file_path, content)
        if should_ref and not has_ref:
            missing_references.append(file_path)

        # Check outdated values
        all_outdated.extend(check_outdated_values(file_path, content))

        # Check terminology
        all_terminology.extend(check_terminology(file_path, content))

        # Track value appearances
        appearances = check_value_consistency(file_path, content)
        for category, items in appearances.items():
            value_map[category].extend(items)

    # Report results. Only outdated values and terminology issues make the
    # run fail.
    has_issues = bool(all_outdated or all_terminology)

    # Outdated values
    if all_outdated:
        _print_section("OUTDATED VALUES FOUND")
        for issue in all_outdated:
            rel_path = _display_path(issue['file'], engine_path)
            print(f"\n  {rel_path}:{issue['line']}")
            print(f"    Found: {issue['found']}")
            print(f"    Should be: {issue['replacement']}")
            print(f"    ({issue['description']})")

    # Terminology issues
    if all_terminology:
        _print_section("TERMINOLOGY ISSUES")
        for issue in all_terminology:
            rel_path = _display_path(issue['file'], engine_path)
            print(f"\n  {rel_path}:{issue['line']}")
            print(f"    Issue: {issue['issue']}")

    # Missing CONSTANTS.md references
    if missing_references:
        _print_section("FILES MISSING CONSTANTS.md REFERENCE")
        print("\nThese files should reference CONSTANTS.md:")
        for missing in missing_references:
            print(f"  - {_display_path(missing, engine_path)}")
        if fix_refs:
            print("\nTo fix, add this line near the top of each file:")
            print('  > **Numeric values:** See `/CONSTANTS.md`')

    # Value distribution (verbose only)
    if verbose:
        _print_section("VALUE DISTRIBUTION (where canonical values appear)")
        for category, items in sorted(value_map.items()):
            print(f"\n{category.upper()}:")
            for item in items[:10]:  # Limit to first 10 per category
                rel_path = _display_path(item['file'], engine_path)
                print(f"  {item['value']}: {rel_path}:{item['line']}")
            if len(items) > 10:
                print(f"  ... and {len(items) - 10} more")

    # Summary
    _print_section("SUMMARY")
    print(f"Files scanned: {len(md_files)}")
    print(f"Outdated values: {len(all_outdated)}")
    print(f"Terminology issues: {len(all_terminology)}")
    print(f"Missing CONSTANTS.md refs: {len(missing_references)}")

    if has_issues:
        print("\n" + "=" * 70)
        print("DOCUMENTATION CHECK: ISSUES FOUND")
        print("Fix the issues above, then re-run this script.")
        print("=" * 70 + "\n")
        sys.exit(1)
    else:
        print("\n" + "=" * 70)
        print("DOCUMENTATION CHECK: PASSED")
        print("All documents are consistent with CONSTANTS.md")
        print("=" * 70 + "\n")
        sys.exit(0)


# Run the checks only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()