#!/usr/bin/env python3
"""Build + drift-check the Recoil engine topology SSOT.

Authored sources:  recoil/architecture/topology/nodes/*.topo.yaml  (validated by topology.schema.json)
Generated outputs: recoil/architecture/topology/generated/{topology.full.json, TOPOLOGY_REDUCED.md,
                   topology.engineering.md, topology.investor.md, topology.drift.json}

Generated files are DETERMINISTIC functions of the authored inputs + ssot_manifest.yaml ONLY
(no timestamps / no commit sha) so `--check` can diff them reproducibly.

Usage:
    python3 recoil/architecture/topology/tools/build_topology.py --write   # regenerate generated/
    python3 recoil/architecture/topology/tools/build_topology.py --check   # CI: fail on drift / broken refs

Format SSOT: consultations/recoil/engine-topology-ssot-2026-06-19/SYNTHESIS.md
"""
from __future__ import annotations

import argparse
import ast
import json
import re
import sys
from pathlib import Path

import yaml

# All paths are anchored on this file's location, so the tool does not depend on the cwd.
TOPO_DIR = Path(__file__).resolve().parent.parent          # recoil/architecture/topology
REPO_ROOT = TOPO_DIR.parent.parent.parent                  # repo root
NODES_DIR = TOPO_DIR / "nodes"                             # authored *.topo.yaml inputs
GEN_DIR = TOPO_DIR / "generated"                           # generated outputs
SCHEMA_PATH = TOPO_DIR / "topology.schema.json"            # JSON Schema the node files are validated against
MANIFEST_PATH = REPO_ROOT / "recoil/architecture/ssot_manifest.yaml"
UNMODELED_ALLOWLIST_PATH = REPO_ROOT / "recoil/architecture/topology/unmodeled_surfaces_allowlist.txt"

# Top-level list keys a node file may carry; each becomes one bucket of the merged graph.
NODE_TYPES = ("capabilities", "entrypoints", "phases", "routes", "loops",
              "flags", "artifacts", "divergences", "symbols", "schema")
REDUCED_TOKEN_CAP = 7500  # token proxy = len(text)//4 — JT "build as measured": every divergence carries its
# invariant + full entrypoint/loop/flag indexes + the schema dup-cluster block uncompressed; still a small
# fraction of a session load.

# symref-bearing fields per node type (checked to exist on disk in --check)
# A symref is a repo-relative path, optionally followed by `::Symbol` (see check_symrefs).
SYMREF_FIELDS = {
    "entrypoints": ["file"], "routes": ["entry", "resolves_via"],
    "loops": ["driver"], "flags": ["defined"], "symbols": ["file"],
    "schema": ["file"],
}


class BuildError(Exception):
    """Error type reserved for failures of the topology build itself."""


def _load_yaml(p: Path):
    """Parse the YAML file at ``p`` with the safe loader.

    The file is decoded explicitly as UTF-8 so that non-ASCII text in the
    authored sources loads identically on every platform instead of depending
    on the locale's default codec.

    Returns the parsed document, or ``{}`` when the document is empty (or
    otherwise falsy).
    """
    with open(p, encoding="utf-8") as fh:
        return yaml.safe_load(fh) or {}


def _validate(doc: dict, schema: dict, fname: str, errors: list[str]) -> None:
    """Validate one node document against the topology JSON Schema (fail-closed).

    With ``jsonschema`` importable, every Draft-7 violation is appended to
    ``errors`` (ordered by instance path) and the function returns. When the
    import fails, an error demanding the dependency is recorded first — so the
    run can never pass — and a manual required-field / unknown-field check is
    then run as an extra diagnostic pass.

    ``fname`` only prefixes the messages; ``errors`` is mutated in place.
    """
    try:
        import jsonschema
        validator = jsonschema.Draft7Validator(schema)
        ordered = sorted(validator.iter_errors(doc), key=lambda err: err.path)
        errors.extend(
            f"{fname}: schema: {'/'.join(map(str, err.path))}: {err.message}"
            for err in ordered
        )
        return
    except ImportError:
        errors.append(f"{fname}: jsonschema not installed — REQUIRED for fail-closed topology "
                      "validation (pip install jsonschema / add to recoil/requirements.txt)")

    # Manual pass — only reached after the import failure recorded above.
    defs = schema["$defs"]
    type_to_def = {
        "capabilities": "capability",
        "entrypoints": "entrypoint",
        "phases": "phase",
        "routes": "route",
        "loops": "loop",
        "flags": "flag",
        "artifacts": "artifact",
        "divergences": "divergence",
        "symbols": "symbol",
    }
    known_top_level = ("subsystem", "out_of_scope_capabilities", *NODE_TYPES)
    for key in doc:
        if key not in known_top_level:
            errors.append(f"{fname}: unknown top-level key '{key}'")
    for ntype, dname in type_to_def.items():
        spec = defs[dname]
        required = spec.get("required", [])
        allowed = set(spec["properties"])
        for idx, item in enumerate(doc.get(ntype) or []):
            if not isinstance(item, dict):
                errors.append(f"{fname}: {ntype}[{idx}] not a mapping")
                continue
            errors.extend(
                f"{fname}: {ntype}[{idx}] missing required '{req}'"
                for req in required if req not in item
            )
            errors.extend(
                f"{fname}: {ntype}[{idx}] unknown field '{key}'"
                for key in item if key not in allowed
            )


def load_graph() -> tuple[dict, list[str]]:
    """Load + validate all node files into a merged graph. Returns (graph, errors).

    Every ``*.topo.yaml`` under NODES_DIR is read in sorted filename order,
    validated against the schema, and its node lists are concatenated per node
    type. Duplicate ids (across all node types and files) are reported.

    Structurally broken input — a document that is not a mapping, a node-type
    key that is not a list, or a node that is not a mapping — is reported as an
    error and skipped rather than crashing the loader, so a single malformed
    file still yields a complete error report.
    """
    # JSON is UTF-8 by specification; do not depend on the locale codec.
    schema = json.loads(SCHEMA_PATH.read_text(encoding="utf-8"))
    errors: list[str] = []
    graph: dict[str, list] = {t: [] for t in NODE_TYPES}
    out_of_scope: list[str] = []
    seen_ids: dict[str, str] = {}
    for f in sorted(NODES_DIR.glob("*.topo.yaml")):
        doc = _load_yaml(f)
        if not isinstance(doc, dict):
            errors.append(f"{f.name}: top-level YAML document must be a mapping, "
                          f"got {type(doc).__name__}")
            continue
        _validate(doc, schema, f.name, errors)
        out_of_scope.extend(doc.get("out_of_scope_capabilities") or [])
        for ntype in NODE_TYPES:
            entries = doc.get(ntype) or []
            if not isinstance(entries, list):
                errors.append(f"{f.name}: '{ntype}' must be a list, got {type(entries).__name__}")
                continue
            for i, item in enumerate(entries):
                if not isinstance(item, dict):
                    # Later checks index nodes by key; a non-mapping cannot enter the graph.
                    errors.append(f"{f.name}: {ntype}[{i}] is not a mapping — skipped")
                    continue
                nid = item.get("id")
                if nid in seen_ids:
                    errors.append(f"{f.name}: duplicate id '{nid}' (also in {seen_ids[nid]})")
                seen_ids[nid] = f.name
                graph[ntype].append(item)
    graph["out_of_scope_capabilities"] = sorted(set(out_of_scope))
    return graph, errors


def manifest_capabilities() -> dict:
    """Return the ``capabilities`` entry of ssot_manifest.yaml, or ``{}`` if absent/empty."""
    manifest = _load_yaml(MANIFEST_PATH)
    return manifest.get("capabilities") or {}


def check_fk(graph: dict, errors: list[str]) -> None:
    """Check capability foreign keys between the topology graph and the manifest.

    Appends to ``errors``:
      * capability nodes whose id is not a manifest capability;
      * manifest capabilities that are neither mapped nor declared out-of-scope;
      * out-of-scope entries that are unknown to the manifest or also mapped;
      * any ``capability`` / ``forks_capability`` on routes, divergences and
        flags, ``capability`` on loops, and ``capabilities[]`` on entrypoints
        that does not name a declared capability node.

    Set-valued collections are iterated in sorted order so the error list is
    reproducible from run to run.
    """
    caps = manifest_capabilities()
    referenced = {c["id"] for c in graph["capabilities"]}
    out_of_scope = set(graph.get("out_of_scope_capabilities") or [])
    for cid in sorted(referenced, key=str):
        if cid not in caps:
            errors.append(f"capability FK '{cid}' not in ssot_manifest.yaml")
    # MANIFEST PARITY (Codex CRITICAL): every manifest capability must be EITHER mapped
    # as a topology node OR explicitly declared out_of_scope — no silent gaps.
    for cid in caps:
        if cid not in referenced and cid not in out_of_scope:
            errors.append(f"manifest capability '{cid}' has NO topology node and is not in "
                          f"out_of_scope_capabilities (map it or declare it out-of-scope)")
    for cid in sorted(out_of_scope, key=str):
        if cid not in caps:
            errors.append(f"out_of_scope_capabilities lists '{cid}' which is not a manifest capability")
        if cid in referenced:
            errors.append(f"capability '{cid}' is BOTH mapped and out_of_scope — pick one")
    # EVERY capability-bearing field must reference a declared capability node (Codex MAJOR):
    # routes/divergences (capability|forks_capability), flags (forks_capability), loops (capability),
    # entrypoints (capabilities[] list).
    for ntype in ("routes", "divergences", "flags"):
        for item in graph[ntype]:
            # Check BOTH fields: a node carrying the two must not have the second one
            # escape validation just because the first is set. Report each value once.
            reported: list = []
            for field in ("capability", "forks_capability"):
                cap = item.get(field)
                if cap and cap not in referenced and cap not in reported:
                    reported.append(cap)
                    errors.append(f"{ntype} '{item['id']}' references capability '{cap}' "
                                  f"not declared in any _capabilities/*.topo.yaml")
    for lp in graph["loops"]:
        cap = lp.get("capability")
        if cap and cap not in referenced:
            errors.append(f"loop '{lp['id']}'.capability -> '{cap}' is not a declared capability node")
    for e in graph["entrypoints"]:
        for cap in (e.get("capabilities") or []):
            if cap not in referenced:
                errors.append(f"entrypoint '{e['id']}'.capabilities -> '{cap}' is not a declared capability node")


def _symbol_in(body: str, name: str) -> bool:
    """True if `name` is defined in `body` as a def/class, a module/class assignment, or an
    annotated (dataclass) field — allows indentation so methods + dataclass attrs match."""
    return bool(re.search(
        rf"(def|class)\s+{re.escape(name)}\b|^\s*{re.escape(name)}\s*[:=]", body, re.M))


def _defines(node: ast.AST, name: str) -> bool:
    """True if this AST node defines `name` — a def/class, a Name= / Name: assignment, or a
    `self.name = ` / `name: T` attribute (dataclass field or constructor attr)."""
    if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
        return node.name == name
    if isinstance(node, ast.AnnAssign):
        t = node.target
        return (isinstance(t, ast.Name) and t.id == name) or (isinstance(t, ast.Attribute) and t.attr == name)
    if isinstance(node, ast.Assign):
        return any((isinstance(t, ast.Name) and t.id == name)
                   or (isinstance(t, ast.Attribute) and t.attr == name) for t in node.targets)
    return False


def _member_of(cls_node: ast.ClassDef, member: str) -> bool:
    """Tell whether any node inside ``cls_node`` (the class node included) defines ``member``."""
    for descendant in ast.walk(cls_node):
        if _defines(descendant, member):
            return True
    return False


def _symbol_exists(body: str, sym: str) -> bool:
    """AST symbol existence. Bare name → defined ANYWHERE (module, class body, or self-attr;
    a method referenced unqualified is fine). `Class.method` (incl. `module.Class.method`) →
    `method` must be a member of class `Class` specifically (Codex MAJOR: beats the regex that
    let a qualified ref pass when the method lived on a DIFFERENT class).

    Falls back to the regex matcher when the text cannot be parsed as Python. Besides
    SyntaxError, ``ast.parse`` can raise ValueError (source containing null bytes on older
    interpreters); both take the fallback so one odd file cannot abort the whole check run.
    In the fallback, both the first and the last dotted component must match.
    """
    try:
        tree = ast.parse(body)
    except (SyntaxError, ValueError):
        return all(_symbol_in(body, n) for n in {sym.split(".")[0], sym.split(".")[-1]})
    parts = sym.split(".")
    if len(parts) == 1:
        return any(_defines(n, parts[0]) for n in ast.walk(tree))
    # Only the last two components matter: the owning class and its member.
    cls, member = parts[-2], parts[-1]
    return any(isinstance(n, ast.ClassDef) and n.name == cls and _member_of(n, member)
               for n in ast.walk(tree))


def check_gates(graph: dict, errors: list[str]) -> None:
    """Verify that every flag/route gate reference points at a real node.

    Three relations are checked, in this order: ``route.gated_by`` → flag,
    ``flag.gates_routes[]`` → route, ``divergence.on_flag`` → flag. Each
    dangling reference appends one message to ``errors``.
    """
    known_flags = {flag["id"] for flag in graph["flags"]}
    known_routes = {route["id"] for route in graph["routes"]}

    for route in graph["routes"]:
        gate = route.get("gated_by")
        if not gate or gate in known_flags:
            continue
        errors.append(f"route '{route['id']}'.gated_by -> '{gate}' is not an existing flag node")

    for flag in graph["flags"]:
        errors.extend(
            f"flag '{flag['id']}'.gates_routes -> '{rid}' is not an existing route node"
            for rid in (flag.get("gates_routes") or [])
            if rid not in known_routes
        )

    # A typo'd divergence gate must not render into the reduced tier as if it were real.
    for div in graph["divergences"]:
        gate = div.get("on_flag")
        if not gate or gate in known_flags:
            continue
        errors.append(f"divergence '{div['id']}'.on_flag -> '{gate}' is not an existing flag node")


def check_symrefs(graph: dict, errors: list[str]) -> None:
    """Verify that every symbol reference in the graph resolves on disk.

    Pass 1 walks SYMREF_FIELDS. Each value is ``path`` or ``path::Symbol``,
    relative to the repo root: the file must exist and, when a symbol is
    given, it must be defined there. For ``schema`` nodes, every declared
    field must additionally be a member of the anchored class (the ``::``
    symbol, else the node's ``model``).

    Pass 2 covers ``symbols`` and ``entrypoints`` nodes that carry the symbol
    in a separate ``symbol`` field. The special symbol ``__main__`` is valid
    only when the file contains an actual ``__name__ == "__main__"`` comparison.

    Source files are decoded as UTF-8 (undecodable bytes ignored) so the result
    does not depend on the platform's locale encoding.
    """
    # The script guard, in either operand order and with either quote style. A plain
    # substring test would also accept a file that merely mentions `__main__` in a comment.
    main_guard = re.compile(
        r"""__name__\s*==\s*['"]__main__['"]|['"]__main__['"]\s*==\s*__name__""")

    for ntype, fields in SYMREF_FIELDS.items():
        for item in graph[ntype]:
            for field in fields:
                ref = item.get(field)
                if not ref:
                    continue
                path_part, _, sym = ref.partition("::")
                fpath = REPO_ROOT / path_part
                if not fpath.is_file():
                    errors.append(f"{ntype} '{item['id']}'.{field}: file not found: {path_part}")
                    continue
                if not sym and ntype != "schema":
                    continue  # file-only reference: existence was all there was to check
                body = fpath.read_text(encoding="utf-8", errors="ignore")
                if sym and not _symbol_exists(body, sym):
                    errors.append(f"{ntype} '{item['id']}'.{field}: symbol '{sym}' "
                                  f"not found (AST) in {path_part}")
                if ntype != "schema":
                    continue
                cls = sym or item.get("model")
                if not cls:
                    errors.append(f"{ntype} '{item['id']}'.{field}: missing Class anchor in {path_part}")
                    continue
                for schema_field in item.get("fields") or []:
                    field_name = schema_field.get("name")
                    if field_name and not _symbol_exists(body, f"{cls}.{field_name}"):
                        errors.append(f"{ntype} '{item['id']}'.fields: symbol "
                                      f"'{cls}.{field_name}' not found (AST) in {path_part}")
    # Separate `file` + `symbol` fields (symbol nodes + entrypoints carry them apart, not as
    # `file::sym`). Codex MAJOR: validate the symbol exists in its file, not just the file.
    for ntype in ("symbols", "entrypoints"):
        for item in graph[ntype]:
            fpart = (item.get("file") or "").split("::")[0]
            sym = item.get("symbol")
            if not fpart or not sym:
                continue
            fpath = REPO_ROOT / fpart
            if not fpath.is_file():
                errors.append(f"{ntype} '{item['id']}'.file not found: {fpart}")
                continue
            body = fpath.read_text(encoding="utf-8", errors="ignore")
            # `__main__` is the script-execution convention (run via `python file.py`); valid
            # iff the file actually carries an `if __name__ == "__main__"` guard.
            if sym == "__main__":
                if not main_guard.search(body):
                    errors.append(f"{ntype} '{item['id']}'.symbol '__main__' but {fpart} has no "
                                  f"`if __name__ == \"__main__\"` guard")
                continue
            if not _symbol_exists(body, sym):
                errors.append(f"{ntype} '{item['id']}'.symbol '{sym}' not found (AST) in {fpart}")


def _cell_key(r: dict) -> tuple:
    """Identity of a rendered divergence cell — two forks that render identically
    are a non-fork (A==B) and must be rejected unless the divergence opts in."""
    return (r.get("surface"), r.get("ref_kind"), (r.get("resolves_via") or "").split("::")[-1])


def check_route_refs(graph: dict, errors: list[str]) -> None:
    """Report route-referencing fields that point at nothing.

    ``fallback_route`` on a route must name a route. ``retries_into`` and
    ``reroll_into`` on a loop re-enter whatever they re-run, so they may name
    a route, another loop, or an entrypoint.
    """
    routes = graph["routes"]
    route_ids = {route["id"] for route in routes}
    loop_ids = {loop["id"] for loop in graph["loops"]}
    entrypoint_ids = {ep["id"] for ep in graph["entrypoints"]}
    rerun_targets = route_ids | loop_ids | entrypoint_ids

    for route in routes:
        target = route.get("fallback_route")
        if target and target not in route_ids:
            errors.append(
                f"route '{route['id']}'.fallback_route -> '{target}' is not an existing route node")

    for loop in graph["loops"]:
        for field in ("retries_into", "reroll_into"):
            target = loop.get(field)
            if not target or target in rerun_targets:
                continue
            errors.append(
                f"loop '{loop['id']}'.{field} -> '{target}' is not an existing route/loop/entrypoint node")


def check_phases(graph: dict, errors: list[str]) -> None:
    """Phase spine FKs must resolve: entrypoints[] -> entrypoint nodes, produces[] -> artifact nodes,
    next[] -> phase nodes (so a dangling spine link can't ship).

    A key that is present but null (an empty YAML value) is treated the same as
    an absent key, matching how the rest of this file reads optional lists.
    Each dangling reference appends one message to ``errors``.
    """
    ep_ids = {e["id"] for e in graph["entrypoints"]}
    art_ids = {a["id"] for a in graph["artifacts"]}
    ph_ids = {p["id"] for p in graph["phases"]}
    # (field, valid ids, node kind used in the message) — checked in this order per phase.
    links = (
        ("entrypoints", ep_ids, "entrypoint"),
        ("produces", art_ids, "artifact"),
        ("next", ph_ids, "phase"),
    )
    for p in graph.get("phases") or []:
        for field, valid_ids, kind in links:
            # `or []`: dict.get's default does not apply when the key exists with value None.
            for ref in p.get(field) or []:
                if ref not in valid_ids:
                    errors.append(f"phase '{p['id']}'.{field} -> '{ref}' is not an existing {kind} node")


def check_divergences(graph: dict, errors: list[str]) -> None:
    """Accuracy gate (Codex review P0 #1): a divergence must fork into >=2 REAL route
    nodes that render distinctly. Rejects placeholder `?` cells, A==B non-forks, and
    forks pointing at non-route ids (artifacts etc.).

    A null ``forks`` value is treated as an empty list (and reported as <2 forks)
    instead of crashing. Routes are indexed by id once; on duplicate ids the first
    occurrence wins. Each problem appends one message to ``errors``.
    """
    routes_by_id: dict = {}
    for route in graph["routes"]:
        routes_by_id.setdefault(route["id"], route)
    for d in graph["divergences"]:
        forks = d.get("forks") or []
        if len(forks) < 2:
            errors.append(f"divergence '{d['id']}' has <2 forks ({forks})")
            continue
        rs = []
        for fid in forks:
            if fid not in routes_by_id:
                errors.append(f"divergence '{d['id']}' fork '{fid}' is not a route node "
                              f"(forks must reference route ids, not artifacts/other)")
                continue
            rs.append(routes_by_id[fid])
        if len(rs) != len(forks):
            continue  # already reported a non-route fork; skip A==B until fixed
        keys = [_cell_key(r) for r in rs]
        if not d.get("allow_same") and len(keys) != len(set(keys)):
            errors.append(f"divergence '{d['id']}' renders identical fork routes (A==B): "
                          f"{keys} — fix the routes or set allow_same:true with a reason")


def check_schema_dups(graph: dict, errors: list[str]) -> dict[str, list[str]]:
    """Group schema fields by their semantic tag.

    Returns ``{tag: sorted ["Model.field", ...]}`` with tags in sorted order.
    A field without a name, or without a ``semantic`` key at all, appends an
    error; a field whose ``semantic`` is present but empty is simply left out.
    Several fields sharing one tag is NOT an error — such clusters are the
    queryable data this function exists to produce.
    """
    by_tag: dict[str, list[str]] = {}
    for node in graph.get("schema", []):
        node_id = node.get("id", "?")
        owner = node.get("model") or node.get("id") or "?"
        for fld in node.get("fields") or []:
            fname = fld.get("name")
            if not fname:
                errors.append(f"schema '{node_id}' has field missing name")
            elif "semantic" not in fld:
                errors.append(f"schema '{node_id}'.{fname} missing semantic tag")
            elif fld["semantic"]:
                by_tag.setdefault(fld["semantic"], []).append(f"{owner}.{fname}")
    return {tag: sorted(by_tag[tag]) for tag in sorted(by_tag)}


def _validate_graph(graph: dict) -> list[str]:
    """Run every graph-level check in a fixed order and return the collected errors."""
    errors: list[str] = []
    checks = (
        check_fk,
        check_symrefs,
        check_route_refs,
        check_gates,
        check_phases,
        check_divergences,
        check_schema_dups,  # its return value (the clusters) is not needed here
    )
    for check in checks:
        check(graph, errors)
    return errors


# ── projections ────────────────────────────────────────────────────────────────
def _route(graph, rid):
    return next((r for r in graph["routes"] if r["id"] == rid), None)


def build_full_json(graph: dict) -> str:
    """Serialize the whole graph as the full-tier JSON document.

    Keys are sorted and the indent is fixed so the text is a deterministic
    function of the graph; the result ends with a single newline.
    """
    payload = {"version": 1, "nodes": graph}
    text = json.dumps(payload, indent=2, sort_keys=True)
    return f"{text}\n"


# risk_level values that keep a node in the reduced tier when it has no explicit
# reduced_tier pin (consulted by _in_reduced).
_HIGH = ("high", "critical")


def _first_sentence(text: str) -> str:
    """First sentence of a (possibly multi-line) summary — the reduced tier keeps the
    headline; the full prose rides topology.full.json."""
    flat = " ".join((text or "").split())
    m = re.search(r"^(.*?[.!?])(\s|$)", flat)
    return m.group(1) if m else flat


def _in_reduced(item: dict) -> bool:
    """Reduced-tier projection rule: pin reduced_tier=always; drop reduced_tier=never;
    otherwise keep only high/critical-risk nodes (low/medium ride the full tier)."""
    rt = item.get("reduced_tier")
    if rt == "always":
        return True
    if rt == "never":
        return False
    return item.get("risk_level") in _HIGH


def _schema_field_line(model: str, field: dict) -> str:
    semantic = field.get("semantic") or "none"
    return (f"- {model}.{field['name']}: {field.get('type', '?')} — "
            f"{field.get('meaning', '')}  [semantic: {semantic}]")


def _append_reduced_schema_block(out: list[str], graph: dict) -> None:
    always_fields: dict[str, dict] = {}
    for node in graph.get("schema", []):
        model = node.get("model")
        if not model:
            continue
        for field in node.get("fields") or []:
            if field.get("dup_tier") == "always":
                always_fields[f"{model}.{field.get('name')}"] = {"model": model, "field": field}
    if not always_fields:
        return

    all_clusters = check_schema_dups(graph, [])
    reduced_clusters = {
        tag: [ref for ref in refs if ref in always_fields]
        for tag, refs in all_clusters.items()
    }
    reduced_clusters = {tag: refs for tag, refs in reduced_clusters.items() if refs}
    if not reduced_clusters:
        return

    out.append("## SCHEMA SSOT — bible data-models: duplication-prone fields")
    out.append("")
    out.append("### Semantic Cluster Index")
    for tag, refs in reduced_clusters.items():
        out.append(f"- semantic: {tag}")
        for ref in refs:
            item = always_fields[ref]
            out.append(f"  {_schema_field_line(item['model'], item['field'])}")
    out.append("")


def build_reduced(graph: dict) -> str:
    """Render the reduced-tier markdown document (TOPOLOGY_REDUCED).

    Sections are emitted in a fixed order: the divergence table followed by
    the invariant of each divergence that has one; the schema semantic-cluster
    block (if any); all entrypoints; flags that carry ``forks_capability``;
    all loops; the phase spine; and the capabilities with their canonical
    target, state and deprecated paths looked up in the manifest.

    Returns the lines joined with ``\\n``.
    """
    caps = manifest_capabilities()
    out: list[str] = []
    out.append("# TOPOLOGY_REDUCED — load every Recoil session  (AUTO-GENERATED — do not edit; "
               "`build_topology.py --write`)")
    out.append("")
    # 1. DIVERGENCES first — the anti-PR135 lead. EVERY divergence appears in the table AND
    #    carries its full invariant prose below it (JT "let it breathe" — every fork's WHY is
    #    in-context, not just the critical ones).
    out.append("## ⚠ DIVERGENCES — capabilities that FORK by surface/flag. Read before touching them.")
    out.append("")
    out.append("| capability | fork | route A | route B | gate |")
    out.append("|---|---|---|---|---|")
    inv_lines: list[str] = []
    for d in graph["divergences"]:
        rs = [_route(graph, rid) for rid in d.get("forks", [])]
        # Drop fork ids that do not resolve to a route node.
        rs = [r for r in rs if r]

        def _cell(r):
            return f"{r['surface']} → {r['ref_kind']} via {r['resolves_via'].split('::')[-1]}" if r else "?"
        # Only the first two resolved routes get their own column; the rest are summarized.
        a = _cell(rs[0]) if len(rs) > 0 else "?"
        b = _cell(rs[1]) if len(rs) > 1 else "?"
        if len(rs) > 2:
            b += f"  (+{len(rs) - 2} more routes)"
        out.append(f"| {d['capability']} | {d['id']} | {a} | {b} | {d.get('on_flag') or '—'} |")
        if d.get("invariant"):
            inv_lines.append(f"> **{d['id']}** invariant: {d['invariant']}")
    out.append("")
    out.extend(inv_lines)
    out.append("")
    _append_reduced_schema_block(out, graph)
    # 2. Entrypoints — ALL (the index is compact + high-value; breathe).
    eps = list(graph["entrypoints"])
    if eps:
        out.append("## Entrypoints")
        for e in eps:
            out.append(f"- `{e['id']}` — {_first_sentence(e.get('summary', ''))}  →  `{e['invoke']}`")
        out.append("")
    # 2b. Flags that FORK a capability — the toggles that change behavior.
    fflags = [f for f in graph.get("flags", []) if f.get("forks_capability")]
    if fflags:
        out.append("## Flags (fork behavior)")
        for f in fflags:
            out.append(f"- `{f['id']}` — forks `{f['forks_capability']}` "
                       f"(default: {f.get('default', '?')})")
        out.append("")
    # 3. Loops — ALL retry/reroll/strategy loops; primary exit condition only.
    lps = list(graph["loops"])
    if lps:
        out.append("## Loops (retry / reroll / strategy)")
        out.append("| loop | kind | trigger | bound | primary exit |")
        out.append("|---|---|---|---|---|")
        for lp in lps:
            bound = lp.get("bound") or {}
            bstr = str(bound.get("max", "—")) if bound else "—"
            # Only the first listed exit condition is shown.
            exit0 = _first_sentence((lp.get("exit") or ["—"])[0])
            out.append(f"| {lp['id']} | {lp['kind']} | {_first_sentence(lp['trigger'])} | "
                       f"{bstr} | {exit0} |")
        out.append("")
    # 4. Phases (spine) — compact: the flow line + a one-sentence each (full detail in investor/full).
    if graph["phases"]:
        out.append("## Phases (spine)")
        out.append("`" + " → ".join(p["id"] for p in graph["phases"]) + "`")
        for p in graph["phases"]:
            out.append(f"- `{p['id']}`: {_first_sentence(p['summary'])}")
        out.append("")
    # 5. Capabilities (canonical + state from manifest)
    out.append("## Capabilities (canonical home + lifecycle — SSOT: ssot_manifest.yaml)")
    for c in graph["capabilities"]:
        m = caps.get(c["id"], {})
        canon = m.get("canonical_target") or m.get("canonical") or "?"
        state = m.get("state", "?")
        out.append(f"- `{c['id']}` → {canon}  [{state}]")
        if m.get("deprecated_paths"):
            out.append(f"   deprecated: {', '.join(s.split('::')[-1] for s in m['deprecated_paths'])}")
    out.append("")
    return "\n".join(out)


def _mid(s: str) -> str:
    return re.sub(r"\W", "_", s)


def build_engineering(graph: dict) -> str:
    """Render the engineering-view markdown.

    Layout: a node-count line, a Mermaid flowchart (capability → divergence →
    each fork that resolves to a route), then one section per divergence
    listing its risk, invariant and resolved fork routes. Fork ids that do not
    resolve to a route are silently left out. Returns the lines joined with
    ``\\n``.
    """
    node_counts = ", ".join(f"{t}={len(graph[t])}" for t in NODE_TYPES)
    lines = [
        "# Recoil engine topology — engineering view (AUTO-GENERATED)",
        "",
        "Nodes: " + node_counts,
        "",
        # Divergence-centric fork map; deterministic so it can be drift-checked.
        "```mermaid",
        "flowchart LR",
    ]
    for div in graph["divergences"]:
        div_node, cap_node = _mid(div["id"]), _mid(div["capability"])
        lines.append(f"  {cap_node}([{div['capability']}]) --> {div_node}{{{div['id']}}}")
        for fork_id in div.get("forks", []):
            route = _route(graph, fork_id)
            if not route:
                continue
            edge_label = f"{route['surface']}/{route['ref_kind']}"
            lines.append(
                f"  {div_node} -->|{edge_label}| {_mid(fork_id)}[{route['resolves_via'].split('::')[-1]}]")
    lines += ["```", ""]

    for div in graph["divergences"]:
        lines.append(f"## divergence: {div['id']}  (capability: {div['capability']})")
        lines.append(f"- risk: {div.get('risk', '')}")
        lines.append(f"- invariant: {div['invariant']}")
        for fork_id in div.get("forks", []):
            route = _route(graph, fork_id)
            if not route:
                continue
            lines.append(f"- route `{fork_id}`: surface={route['surface']} kind={route['ref_kind']} "
                         f"entry={route.get('entry', '?')} resolves_via={route['resolves_via']} "
                         f"gated_by={route.get('gated_by')}")
        lines.append("")
    return "\n".join(lines)


def build_investor(graph: dict) -> str:
    """Render the phase-level overview markdown.

    Starts with a headline of counts (manifest capabilities, mapped
    capabilities, entrypoints, loops, divergences). When the graph has phases,
    the spine line and one section per phase follow: its summary, the paths of
    the artifacts it produces (the raw id when the artifact is unknown or has
    no path), and its successor phases. Returns the lines joined with ``\\n``.
    """
    lines = ["# Recoil visual engine — overview (AUTO-GENERATED, phase-level)", ""]
    lines.append(f"The engine spans {len(manifest_capabilities())} capabilities. "
                 f"Mapped so far: {len(graph['capabilities'])} capabilities, "
                 f"{len(graph['entrypoints'])} entrypoints, {len(graph['loops'])} feedback loops, "
                 f"{len(graph['divergences'])} architectural divergences.")
    lines.append("")

    phases = graph["phases"]
    if not phases:
        return "\n".join(lines)

    spine = " → ".join(phase["id"] for phase in phases)
    lines += ["## Pipeline spine", "", f"`{spine}`", ""]
    artifacts_by_id = {art["id"]: art for art in graph["artifacts"]}
    for phase in phases:
        heading = phase["id"].replace("_", " ").title()
        lines.append(f"### {heading}")
        lines.append(phase["summary"].strip())
        produced = [
            artifacts_by_id.get(art_id, {}).get("path", art_id)
            for art_id in (phase.get("produces") or [])
        ]
        if produced:
            lines.append(f"- produces: {', '.join(produced)}")
        successors = " → ".join(phase.get("next") or []) or "(terminal)"
        lines.append(f"- next: {successors}")
        lines.append("")
    return "\n".join(lines)


def build_drift(graph: dict) -> str:
    """Serialize the drift summary as sorted, indented JSON ending in a newline.

    The summary holds the node count per node type, the sorted ids of the
    mapped capabilities, and the sorted capability names from the manifest.
    """
    summary = {
        "node_counts": {ntype: len(graph[ntype]) for ntype in NODE_TYPES},
        "capabilities_mapped": sorted(cap["id"] for cap in graph["capabilities"]),
        "manifest_capabilities": sorted(manifest_capabilities()),
    }
    text = json.dumps(summary, indent=2, sort_keys=True)
    return text + "\n"


def _is_base_model_class(cls: ast.ClassDef) -> bool:
    return any((isinstance(base, ast.Name) and base.id == "BaseModel")
               or (isinstance(base, ast.Attribute) and base.attr == "BaseModel")
               for base in cls.bases)


def _is_dataclass_model(cls: ast.ClassDef) -> bool:
    for deco in cls.decorator_list:
        target = deco.func if isinstance(deco, ast.Call) else deco
        if (isinstance(target, ast.Name) and target.id == "dataclass") or (
            isinstance(target, ast.Attribute) and target.attr == "dataclass"
        ):
            return True
    return False


def _class_fields(cls: ast.ClassDef) -> list[str]:
    return [
        stmt.target.id
        for stmt in cls.body
        if isinstance(stmt, ast.AnnAssign) and isinstance(stmt.target, ast.Name)
    ]


def _annotation_model_refs(annotation: ast.AST, model_names: set[str]) -> set[str]:
    refs: set[str] = set()
    for node in ast.walk(annotation):
        if isinstance(node, ast.Name) and node.id in model_names:
            refs.add(node.id)
        elif isinstance(node, ast.Attribute) and node.attr in model_names:
            refs.add(node.attr)
        elif isinstance(node, ast.Constant) and isinstance(node.value, str):
            try:
                refs.update(_annotation_model_refs(ast.parse(node.value, mode="eval").body, model_names))
            except SyntaxError:
                if node.value in model_names:
                    refs.add(node.value)
    return refs


def _schema_model_inventory() -> list[dict]:
    """Live Bible/ref schema boundary.

    render_schema.py is bounded to model classes transitively reachable from GlobalBible.
    ref_types.py contributes its dataclass model classes. Enums, Literals, and scalars are
    intentionally ignored because they are field types, not topology schema nodes.

    Returns one ``{"model", "file", "fields"}`` dict per model: the reachable
    render models first, then the ref dataclasses, each group in source-line
    order. ``fields`` are the names annotated directly in the class body.

    Both sources are decoded explicitly as UTF-8 (Python's default source
    encoding) rather than with the platform locale's codec.
    """
    render_rel = Path("recoil/pipeline/_lib/render_schema.py")
    ref_rel = Path("recoil/core/ref_types.py")
    render_tree = ast.parse((REPO_ROOT / render_rel).read_text(encoding="utf-8"))
    ref_tree = ast.parse((REPO_ROOT / ref_rel).read_text(encoding="utf-8"))

    render_models = {
        cls.name: cls
        for cls in ast.walk(render_tree)
        if isinstance(cls, ast.ClassDef) and _is_base_model_class(cls)
    }
    # Built once: the set of candidate names does not change during the traversal.
    model_names = set(render_models)
    reachable: set[str] = set()
    pending = ["GlobalBible"] if "GlobalBible" in render_models else []
    while pending:
        name = pending.pop()
        if name in reachable:
            continue
        reachable.add(name)
        cls = render_models[name]
        for stmt in cls.body:
            if isinstance(stmt, ast.AnnAssign):
                for ref in sorted(_annotation_model_refs(stmt.annotation, model_names)):
                    if ref not in reachable:
                        pending.append(ref)

    ref_models = {
        cls.name: cls
        for cls in ast.walk(ref_tree)
        if isinstance(cls, ast.ClassDef) and _is_dataclass_model(cls)
    }

    inventory: list[dict] = []
    for name, cls in sorted(render_models.items(), key=lambda item: item[1].lineno):
        if name in reachable:
            inventory.append({"model": name, "file": str(render_rel), "fields": _class_fields(cls)})
    for name, cls in sorted(ref_models.items(), key=lambda item: item[1].lineno):
        inventory.append({"model": name, "file": str(ref_rel), "fields": _class_fields(cls)})
    return inventory


def schema_unmodeled_warnings(graph: dict) -> list[str]:
    """List live model classes and fields that no schema node covers.

    Every model from the live inventory is compared against the graph's schema
    nodes: a model with no schema node yields one "class missing" warning (its
    fields are then not reported individually); otherwise each live field
    absent from the node yields a "field missing" warning.
    """
    declared: dict = {}
    for node in graph.get("schema", []):
        declared[node.get("model")] = {fld.get("name") for fld in (node.get("fields") or [])}

    found: list[str] = []
    for entry in _schema_model_inventory():
        model, source = entry["model"], entry["file"]
        if model not in declared:
            found.append(f"schema class missing: {model} ({source})")
            continue
        known_fields = declared[model]
        found.extend(
            f"schema field missing: {model}.{name} ({source})"
            for name in entry["fields"]
            if name not in known_fields
        )
    return found


def build_design_brief(graph: dict) -> str:
    """Render the Claude-Design Atlas brief (markdown) from the topology graph.

    The document is assembled in this order: an auto-generated banner, the
    Atlas bundle table (which generated file feeds which tab), the pipeline
    spine (only when the graph has phases), coverage counts against the
    manifest capabilities, and one section per divergence listing its risk
    (when present), its invariant, and the ``surface/ref_kind`` of each fork
    route that ``_route`` resolves.

    Returns:
        The markdown text, lines joined with ``\\n`` (ends on an empty line
        entry, so there is no trailing newline character beyond that join).
    """
    mapped_ids = {cap["id"] for cap in graph["capabilities"]}
    manifest_ids = set(manifest_capabilities())
    in_flight = sorted(manifest_ids - mapped_ids)

    lines = [
        "# Recoil engine — Claude-Design Atlas brief (AUTO-GENERATED — do not hand-edit)",
        "",
        "Generated by `build_topology.py` from the topology v2 SSOT "
        "(`recoil/architecture/topology/`). The Claude-Design Atlas consumes the files "
        "below as a generated projection — it must NEVER hand-copy node data (that "
        "re-forks the graph). Regenerate with `build_topology.py --write`.",
        "",
        "## Atlas bundle — which generated file feeds which tab",
        "",
        "| Atlas tab | Source (`generated/`) | Altitude |",
        "|---|---|---|",
        "| Investor glance | `topology.investor.md` | phase spine, no symbols |",
        "| Design brief (this doc) | `topology.design_brief.md` | capability + divergences, narrative |",
        "| Technical flowchart | `topology.engineering.md` | per-route + Mermaid fork map |",
        "| Teardown / interactive | `atlas.render.json` (built by tools/build_atlas_graph.py from topology.full.json + _render_overlay.yaml) | all render nodes |",
        "",
    ]

    # Pipeline spine: the id chain first, then one bullet per phase.
    phases = graph["phases"]
    if phases:
        chain = " → ".join(phase["id"] for phase in phases)
        lines += ["## Pipeline spine", "", f"`{chain}`", ""]
        for phase in phases:
            heading = phase["id"].replace("_", " ").title()
            blurb = phase["summary"].strip()
            lines.append(f"- **{heading}** — {blurb}")
        lines.append("")

    # Coverage: mapped-vs-manifest capability counts plus node tallies.
    tally = (
        f"- {len(mapped_ids)} of {len(manifest_ids)} capabilities mapped · "
        f"{len(graph['entrypoints'])} entrypoints · {len(graph['loops'])} feedback loops · "
        f"{len(graph['divergences'])} architectural divergences."
    )
    lines += ["## Coverage", "", tally]
    if in_flight:
        lines.append(f"- In-flight (not yet modeled): {', '.join(in_flight)}.")
    lines.append("")

    lines += [
        "## Load-bearing divergences (forks a redesign must not silently break)",
        "",
        "Each is a capability that forks by surface/flag; the **invariant** is the property "
        "that must hold across both routes. Full route detail is in the technical/teardown tabs.",
        "",
    ]
    for divergence in graph["divergences"]:
        # Fork ids that _route cannot resolve are silently left out of the list.
        resolved = (_route(graph, route_id) for route_id in divergence.get("forks", []))
        fork_labels = [
            f"`{route['surface']}/{route['ref_kind']}`" for route in resolved if route
        ]
        lines.append(f"### {divergence['id']}  ·  capability: {divergence['capability']}")
        if divergence.get("risk"):
            lines.append(f"- risk: {divergence['risk']}")
        lines.append(f"- invariant: {divergence['invariant']}")
        if fork_labels:
            lines.append(f"- routes: {' vs '.join(fork_labels)}")
        lines.append("")
    return "\n".join(lines)


# Output filename (relative to GEN_DIR) -> generator function. Each generator is
# called with the loaded graph and returns that file's full text. generate_all()
# iterates this mapping in insertion order, and main() writes (--write) or diffs
# (--check) exactly one file per key.
GENERATORS = {
    "topology.full.json": build_full_json,
    "TOPOLOGY_REDUCED.md": build_reduced,
    "topology.engineering.md": build_engineering,
    "topology.investor.md": build_investor,
    "topology.design_brief.md": build_design_brief,
    "topology.drift.json": build_drift,
}


def generate_all(graph: dict) -> dict[str, str]:
    """Run every registered generator on *graph*.

    Returns:
        ``{output filename: generated text}``, keyed and ordered like ``GENERATORS``.
    """
    rendered: dict[str, str] = {}
    for filename, render in GENERATORS.items():
        rendered[filename] = render(graph)
    return rendered


def token_proxy(text: str) -> int:
    """Return a rough token estimate for *text*: its character count floor-divided by 4."""
    char_count = len(text)
    return char_count // 4


def unmodeled_entrypoint_warnings(graph: dict) -> list[str]:
    """WARN (never fail): CLI files under recoil/pipeline/cli that look invocable
    (mention ``__main__``, ``argparse`` or ``click``) but aren't referenced by any
    topology entrypoint `file`.
    A nudge toward coverage; full import-graph drift is a documented follow-up.

    Args:
        graph: loaded topology graph; only ``graph["entrypoints"]`` is read, and
            each entry's ``file`` may carry a ``::symbol`` suffix, which is ignored.

    Returns:
        One warning string per unmodeled candidate file, in sorted path order.
    """
    modeled = {(e.get("file") or "").split("::")[0] for e in graph["entrypoints"]}
    warns: list[str] = []
    # Scope to the canonical CLI dir only — recoil/pipeline/tools is a one-off junk drawer
    # whose utilities aren't engine entrypoints; warning on all of them is pure noise.
    for sub in ("recoil/pipeline/cli",):
        d = REPO_ROOT / sub
        if not d.is_dir():
            continue
        for p in sorted(d.glob("*.py")):
            # Compare in POSIX form: str() would yield backslashes on Windows and
            # never match the forward-slash paths stored in entrypoint `file` fields.
            rel = p.relative_to(REPO_ROOT).as_posix()
            if rel in modeled or p.name.startswith("_"):
                continue
            body = p.read_text(errors="ignore")
            # Plain substring heuristic: any mention of these markers counts.
            if '__main__' in body or 'argparse' in body or 'click' in body:
                warns.append(f"unmodeled CLI entrypoint (not in topology): {rel}")
    return warns


def _symref_parts(raw: str | None) -> tuple[str, str]:
    path, sep, sym = (raw or "").partition("::")
    return path, sym if sep else ""


def _modeled_files(graph: dict) -> set[str]:
    """Collect the file paths named by symref fields across the graph.

    For every node type in ``SYMREF_FIELDS`` except ``"schema"``, takes the
    path part (text before ``::``) of each listed field on each node and keeps
    the non-empty ones.
    """
    def referenced_paths():
        for node_type, ref_fields in SYMREF_FIELDS.items():
            if node_type == "schema":
                continue
            for node in graph.get(node_type, []):
                for ref_field in ref_fields:
                    yield _symref_parts(node.get(ref_field))[0]

    return {path for path in referenced_paths() if path}


def _modeled_symbol_keys(graph: dict) -> set[str]:
    """Return the ``path::symbol`` keys the topology explicitly references.

    ``symbols`` and ``entrypoints`` nodes contribute their ``file`` reference,
    using the node's ``symbol`` value as a fallback when ``file`` carries no
    inline ``::symbol``. ``routes``, ``loops`` and ``flags`` nodes contribute
    each of their ``SYMREF_FIELDS`` references, with no fallback. References
    lacking either a path or a symbol are dropped.
    """
    # (raw reference, fallback symbol) pairs, gathered first and resolved below.
    candidates: list[tuple] = []
    for node_type in ("symbols", "entrypoints"):
        candidates.extend(
            (node.get("file"), node.get("symbol")) for node in graph.get(node_type, [])
        )
    for node_type in ("routes", "loops", "flags"):
        ref_fields = SYMREF_FIELDS.get(node_type, [])
        for node in graph.get(node_type, []):
            candidates.extend((node.get(ref_field), None) for ref_field in ref_fields)

    keys: set[str] = set()
    for raw_ref, fallback in candidates:
        path, inline_symbol = _symref_parts(raw_ref)
        # An inline ``::symbol`` wins over the node-level fallback.
        symbol = inline_symbol or fallback or ""
        if path and symbol:
            keys.add(f"{path}::{symbol}")
    return keys


def _has_main_cli_marker(body: str) -> bool:
    return "__main__" in body and ("argparse" in body or "click" in body)


def _surface_keys(graph: dict) -> tuple[set[str], list[str]]:
    """Scan every modeled file for its public surface.

    For each path from ``_modeled_files(graph)`` the file is read and parsed
    with ``ast``. Each top-level (module-body) sync or async function whose
    name does not start with ``_`` contributes a ``file::name`` key, and a file
    passing ``_has_main_cli_marker`` contributes ``file::__main__``.

    Returns:
        ``(keys, scan_skips)``. ``scan_skips`` is a sorted list of
        ``"file (ExceptionName)"`` entries for files that could not be read or
        parsed; such files contribute no keys.
    """
    keys: set[str] = set()
    scan_skips: list[str] = []
    for file in sorted(_modeled_files(graph)):
        fpath = REPO_ROOT / file
        try:
            body = fpath.read_text(errors="ignore")
            tree = ast.parse(body)
        except (OSError, SyntaxError, ValueError) as exc:
            # ValueError: before Python 3.12 the compiler rejects source containing
            # null bytes with ValueError rather than SyntaxError; treat it as
            # unscannable instead of letting it crash the whole run.
            scan_skips.append(f"{file} ({type(exc).__name__})")
            continue

        # Only module-level definitions count; nested functions and methods do not.
        for node in tree.body:
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and not node.name.startswith("_"):
                keys.add(f"{file}::{node.name}")
        if _has_main_cli_marker(body):
            keys.add(f"{file}::__main__")
    return keys, sorted(scan_skips)


def _load_unmodeled_allowlist(path: Path | None = None) -> set[str]:
    allowlist_path = path or UNMODELED_ALLOWLIST_PATH
    try:
        lines = allowlist_path.read_text(errors="ignore").splitlines()
    except OSError:
        return set()
    return {line.strip() for line in lines if line.strip() and not line.strip().startswith("#")}


def unmodeled_surface_warnings(graph: dict, allowlist: set[str] | None = None) -> list[str]:
    """List surfaces found in modeled files that the topology does not reference.

    Args:
        graph: loaded topology graph.
        allowlist: accepted unmodeled keys; loaded via
            ``_load_unmodeled_allowlist()`` when ``None``.

    Returns:
        One warning per scanned surface key that is neither modeled nor
        allowlisted (sorted), followed by one warning per file the scan skipped.
    """
    known_keys = _modeled_symbol_keys(graph)
    accepted = _load_unmodeled_allowlist() if allowlist is None else allowlist
    surfaces, skipped = _surface_keys(graph)

    messages: list[str] = []
    for key in sorted(surfaces - known_keys - accepted):
        messages.append(
            f"unmodeled surface (modeled file, surface absent from topology): {key}"
        )
    for skip in skipped:
        messages.append(f"unmodeled-surface scan skipped: {skip}")
    return messages


def write_surface_baseline(graph: dict, path: Path | None = None) -> int:
    """Write the current set of unmodeled surfaces as the allowlist baseline.

    Args:
        graph: loaded topology graph.
        path: destination file; falls back to ``UNMODELED_ALLOWLIST_PATH``.

    Returns:
        The number of entries written, or ``-1`` (nothing written) when any
        modeled file could not be scanned.
    """
    surfaces, skipped = _surface_keys(graph)
    if skipped:
        # Refuse to baseline from an incomplete scan.
        return -1

    target = path if path else UNMODELED_ALLOWLIST_PATH
    unmodeled = sorted(surfaces - _modeled_symbol_keys(graph))
    banner = (
        "# GENERATED baseline (build_topology.py --write-surface-baseline). Each line is an intentionally-unmodeled\n"
        "# surface accepted at baseline time. Re-run after triage to shrink. Do NOT hand-curate casually.\n"
    )
    listing = "".join(key + "\n" for key in unmodeled)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(banner + listing)
    return len(unmodeled)


def _print_topology_errors(errors: list[str]) -> None:
    print("TOPOLOGY ERRORS:", file=sys.stderr)
    for e in errors:
        print(f"  - {e}", file=sys.stderr)


def main() -> int:
    """Command-line entry point for the topology build / drift check.

    Exactly one mode flag is required:

    * ``--write``: regenerate every file in ``GENERATORS`` under ``GEN_DIR``.
    * ``--check``: regenerate in memory and fail if any on-disk file is missing
      or differs.
    * ``--write-surface-baseline``: rewrite the unmodeled-surface allowlist.

    Returns:
        Process exit code: ``0`` on success, ``1`` on load/validation errors,
        an unscannable file during baselining, a failed output sanity check,
        or drift.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    g = ap.add_mutually_exclusive_group(required=True)
    g.add_argument("--write", action="store_true", help="regenerate generated/")
    g.add_argument("--check", action="store_true", help="CI: fail on drift / broken refs")
    g.add_argument("--write-surface-baseline", action="store_true",
                   help="write the current unmodeled-surface allowlist baseline")
    args = ap.parse_args()

    graph, errors = load_graph()
    errors.extend(_validate_graph(graph))

    if args.write_surface_baseline:
        # Baselining never emits advisory warnings; it only needs a valid graph.
        if errors:
            _print_topology_errors(errors)
            return 1
        n = write_surface_baseline(graph)
        if n < 0:
            # write_surface_baseline only signals failure; rescan to name the files.
            _, scan_skips = _surface_keys(graph)
            print("cannot baseline: unscannable modeled file(s):", file=sys.stderr)
            for s in scan_skips:
                print(f"  - {s}", file=sys.stderr)
            return 1
        print(f"wrote {n} baseline entries to {UNMODELED_ALLOWLIST_PATH}")
        return 0

    # Advisory warnings are printed before errors and never affect the exit code.
    for w in unmodeled_entrypoint_warnings(graph):
        print(f"  topology WARN: {w}", file=sys.stderr)
    for w in schema_unmodeled_warnings(graph):
        print(f"  topology WARN: {w}", file=sys.stderr)
    surface_warnings = unmodeled_surface_warnings(graph)
    for w in surface_warnings:
        print(f"  topology WARN: {w}", file=sys.stderr)
    print(f"topology: {len(surface_warnings)} unmodeled surface warning(s) "
          "(advisory; see unmodeled_surfaces_allowlist)", file=sys.stderr)
    if errors:
        _print_topology_errors(errors)
        return 1

    # Sanity checks on the generated text, applied in both --write and --check.
    outputs = generate_all(graph)
    reduced_tok = token_proxy(outputs["TOPOLOGY_REDUCED.md"])
    if reduced_tok > REDUCED_TOKEN_CAP:
        print(f"TOPOLOGY ERROR: TOPOLOGY_REDUCED.md token proxy {reduced_tok} > {REDUCED_TOKEN_CAP}",
              file=sys.stderr)
        return 1
    if "## ⚠ DIVERGENCES" not in outputs["TOPOLOGY_REDUCED.md"]:
        print("TOPOLOGY ERROR: reduced tier missing the DIVERGENCES lead table", file=sys.stderr)
        return 1
    if "entry=?" in outputs["topology.engineering.md"]:
        print("TOPOLOGY ERROR: a route renders entry=? — every route needs a real entry",
              file=sys.stderr)
        return 1

    GEN_DIR.mkdir(parents=True, exist_ok=True)
    if args.write:
        for name, text in outputs.items():
            # Pin UTF-8: the outputs contain non-ASCII characters, so the locale
            # default encoding could fail or produce platform-dependent bytes.
            (GEN_DIR / name).write_text(text, encoding="utf-8")
        print(f"wrote {len(outputs)} files to {GEN_DIR.relative_to(REPO_ROOT)} | "
              f"reduced ~{reduced_tok} tok | divergences={len(graph['divergences'])} "
              f"capabilities={len(graph['capabilities'])}/{len(manifest_capabilities())}")
        return 0

    # --check
    drift = []
    for name, text in outputs.items():
        cur = GEN_DIR / name
        # Read with the same pinned encoding used by --write so the comparison
        # does not depend on the machine's locale.
        if not cur.exists() or cur.read_text(encoding="utf-8") != text:
            drift.append(name)
    if drift:
        print(f"TOPOLOGY DRIFT: stale/missing generated files: {', '.join(drift)} "
              f"— run build_topology.py --write", file=sys.stderr)
        return 1
    print(f"topology OK | reduced ~{reduced_tok} tok | no drift | "
          f"divergences={len(graph['divergences'])}")
    return 0


# Script entry point: run the CLI and use main()'s return value as the exit code.
if __name__ == "__main__":
    sys.exit(main())
