#!/usr/bin/env bash
# spec_selfgate.sh — run the EXACT harness codex spec-review locally, BEFORE dispatch.
#
# WHY (2026-06-18): /spec produced specs that passed its Opus self-review but then
# CAPPED on the harness gate, which uses codex gpt-5.5 (high effort) GROUNDED against
# the live worktree. Each cap burned a full worktree + harness spin-up (5 avoidable
# caps across REC-176 + REC-178 in one night). The harness gate is the real bar; this
# runs the IDENTICAL review locally so a spec is converged to VERDICT: READY for free,
# before any dispatch. This is the /spec self-gate AND the convergence engine of
# /orchestrate. Keep this byte-aligned with harness_orchestrator.sh::run_codex_spec_review_gate.
#
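# Usage:
#   spec_selfgate.sh <BUILD_SPEC.md> [--dir <repo-root>] [--findings-out <path>]
#     --dir           directory handed to codex via -C (default: three levels above this script)
#     --findings-out  also write the review's structured JSON findings to <path>
# Env:  HARNESS_CODEX_BIN (codex binary override), SPEC_SELFGATE_TIMEOUT (seconds per codex attempt, default 600).
# Exit: 0 = VERDICT: READY ; 1 = NEEDS-FIXES ; 2 = degenerate/usage/tooling error.
# Output: the full codex review (CRITICAL/MAJOR/MINOR + VERDICT) to stdout.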
# No `-e`: exit statuses are inspected explicitly throughout, because the script's
# own exit code is a contract (0 READY / 1 NEEDS-FIXES / 2 usage-or-tooling error).
set -uo pipefail

# ── Argument parsing ────────────────────────────────────────────────────────────
# Positional: the BUILD_SPEC path (if several are given, the last one wins).
# --dir <path>           working directory handed to codex (default: repo root).
# --findings-out <path>  where to write the structured findings JSON; if the flag is
#                        absent, a FINDINGS_OUT value from the environment is used.
SPEC_FILE=""
WORKING_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)"  # repo root (…/CLAUDE_PROJECTS)
while [ $# -gt 0 ]; do
    case "$1" in
        --dir)
            # Under `set -u` a bare trailing flag would abort on the unbound $2 with a
            # generic status; report it as the usage error (exit 2) it actually is.
            [ $# -ge 2 ] || { echo "ERROR: --dir requires a value" >&2; exit 2; }
            WORKING_DIR="$2"; shift 2 ;;
        --findings-out)
            [ $# -ge 2 ] || { echo "ERROR: --findings-out requires a value" >&2; exit 2; }
            FINDINGS_OUT="$2"; shift 2 ;;
        -h|--help) echo "Usage: $0 <BUILD_SPEC.md> [--dir <repo-root>] [--findings-out <path>]"; exit 2 ;;
        *) SPEC_FILE="$1"; shift ;;
    esac
done
FINDINGS_OUT="${FINDINGS_OUT:-}"
[ -n "$SPEC_FILE" ] || { echo "ERROR: BUILD_SPEC path required" >&2; exit 2; }
[ -f "$SPEC_FILE" ] || { echo "ERROR: spec not found: $SPEC_FILE" >&2; exit 2; }
# Fail fast on an unusable working directory (bad --dir, or the default lookup failed
# and left it empty) instead of discovering it through failed codex attempts.
[ -d "$WORKING_DIR" ] || { echo "ERROR: working dir not found: $WORKING_DIR" >&2; exit 2; }
# Absolutize the spec path: the reviewer is told to read this exact path, independent
# of its own working directory. `--` keeps a leading '-' in the path from being
# parsed as an option.
SPEC_FILE="$(cd -- "$(dirname -- "$SPEC_FILE")" && pwd)/$(basename -- "$SPEC_FILE")"

# ── Deterministic dispatchability pre-check (runs BEFORE any codex spend) ──────────
# Predicts dispatchability, not just review-readiness. Catches two failure classes
# that each wasted a full build (both invisible to the codex prose review):
#   REC-199: the harness phase-parser reads ONLY `## Phase <N>:` headings + `### Validation`
#            blocks; `### PHASE A` / level-3 / `**Gate:**` aborts at dispatch ("No phases found").
#   REC-201: a gate bash block that is a SHELL SYNTAX error (e.g. a heredoc placed
#            mid-`&&`-chain) — the harness freezes the val_cmd at dispatch and can't
#            self-recover, so it caps even when the code is correct. `bash -n` catches it.
# Run the precheck and branch on its exit status rather than on plain success/failure:
#   0     → dispatchable, continue to the codex gate
#   1     → the precheck rejected the spec → exit 1 (NEEDS-FIXES)
#   other → the precheck could not run (127 = python3 missing, 2 = spec unreadable, …)
#           → exit 2 (tooling error), so a broken toolchain is not reported as a bad spec.
precheck_rc=0
SPEC_FILE="$SPEC_FILE" python3 - <<'PRECHECK' || precheck_rc=$?
import os, re, subprocess, sys
# Faithfully MIRROR harness_orchestrator.sh so the precheck predicts the harness, never
# rejects a spec the harness would accept: the EXACT phase regex (:1166) + the 3-tier
# get_validation_cmd extraction (:304) — then bash -n ONLY the command the harness will run.
try:
    with open(os.environ["SPEC_FILE"], encoding="utf-8") as fh:
        lines = fh.read().splitlines()
except (OSError, UnicodeDecodeError) as exc:
    # Not a statement about dispatchability — we simply could not read the spec.
    sys.stderr.write(f"[spec_selfgate] precheck cannot read spec: {exc}\n")
    sys.exit(2)
errs = []
PHASE_RE = re.compile(r"^## Phase [0-9]+(\.[0-9]+)?[a-z]?[: ]")   # harness_orchestrator.sh:1166
phase_idxs = [i for i, l in enumerate(lines) if PHASE_RE.match(l)]
if not phase_idxs:
    errs.append("no headings match the harness phase regex `^## Phase [0-9]+(\\.[0-9]+)?[a-z]?[: ]` "
                "— harness aborts 'No phases found' at dispatch (e.g. `### PHASE A`/level-3/letter-only ids).")

def phase_content(start):
    """Lines of one phase: from its heading up to (not including) the next `## ` heading."""
    end = next((j for j in range(start + 1, len(lines)) if lines[j].startswith("## ")), len(lines))
    return lines[start:end]

def _fenced(content, header_pred):
    """Body of the first ```bash fence that follows the first line matching header_pred ('' if none)."""
    flag = code = False; out = []
    for l in content:
        if not flag and header_pred(l): flag = True; continue
        if flag and re.match(r"^```bash", l): code = True; continue
        if code and l.startswith("```"): break
        if code: out.append(l)
    return "\n".join(out)

GLOBAL_CMD = _fenced(lines, lambda l: l.startswith("## Validation command"))   # tier 3

def validation_cmd(content):
    """Validation command for one phase, resolved fenced → inline → global fallback."""
    cmd = _fenced(content, lambda l: l.startswith("### Validation"))            # tier 1: fenced
    if cmd: return cmd
    for l in content:                                                          # tier 2: inline
        m = re.match(r"^### Validation:\s*(\S.*)", l) or re.match(r"^\*\*Validation:\*\*\s*(\S.*)", l)
        if m: return m.group(1).replace("`", "")
    return GLOBAL_CMD                                                          # tier 3: global fallback

for start in phase_idxs:
    head = lines[start][:46]
    cmd = validation_cmd(phase_content(start))
    if not cmd.strip():
        continue   # harness runs no gate for this phase — not our call to block (conservative)
    # Pin the pipe encoding: the gate text came from a UTF-8 file, and the locale default
    # could otherwise fail on non-ASCII characters in the gate or in bash's diagnostics.
    r = subprocess.run(["bash", "-n"], input=cmd, text=True, capture_output=True,
                       encoding="utf-8", errors="replace")
    if r.returncode != 0:
        tail = (r.stderr.strip().splitlines() or ["bash -n failed"])[-1]
        errs.append(f"{head!r}: validation gate is a SHELL SYNTAX ERROR (harness freezes the val_cmd at "
                    f"dispatch — can't self-recover) — {tail}")
if errs:
    sys.stderr.write("[spec_selfgate] PRECHECK FAILED (dispatchability) — fix before the codex gate:\n")
    for e in errs: sys.stderr.write(f"  - {e}\n")
    sys.exit(1)
sys.stderr.write("[spec_selfgate] precheck OK (harness phase-parse + gate bash -n)\n")
PRECHECK
if [ "$precheck_rc" -eq 1 ]; then
    echo "[spec_selfgate] not dispatchable (precheck) — skipping codex gate." >&2
    exit 1
elif [ "$precheck_rc" -ne 0 ]; then
    echo "[spec_selfgate] precheck could not run (rc=$precheck_rc) — tooling error, not a spec verdict." >&2
    exit 2
fi

#######################################
# Print the path of the codex binary to use.
# Resolution order:
#   1. $HARNESS_CODEX_BIN, if set and non-empty — authoritative: when it is not a
#      usable executable the lookup FAILS rather than falling through.
#   2. The Codex.app bundled binary at its fixed /Applications path.
#   3. Whatever `command -v codex` finds.
# Globals:  HARNESS_CODEX_BIN (read, optional)
# Outputs:  the resolved path on stdout
# Returns:  0 if a binary was found, non-zero otherwise
#######################################
resolve_codex_binary() {
    local override="${HARNESS_CODEX_BIN:-}"
    local app_bundle=/Applications/Codex.app/Contents/Resources/codex

    if [ -n "$override" ]; then
        # -x alone is also true for a searchable directory; require a regular file
        # so a mis-set override fails here instead of later at exec time.
        if [ -f "$override" ] && [ -x "$override" ]; then
            printf '%s' "$override"
            return 0
        fi
        return 1
    fi

    if [ -f "$app_bundle" ] && [ -x "$app_bundle" ]; then
        printf '%s' "$app_bundle"
        return 0
    fi

    command -v codex 2>/dev/null
}
# Resolve the codex binary once, up front; not finding one is a tooling error (exit 2).
CODEX="$(resolve_codex_binary)" || { echo "ERROR: codex binary not found" >&2; exit 2; }

# Core review instruction is byte-IDENTICAL to the harness gate (harness_orchestrator.sh:738)
# so the prose verdict can't drift. When --findings-out is set we APPEND a structured-output
# request (additive — doesn't change the review semantics) so orchestrate_guard gets parseable
# findings. (SYNTHESIS orchestrate-engine-2026-06-18 Q1: the reviewer makes the kind judgment.)
prompt="Adversarial spec-review of $SPEC_FILE. Emit CRITICAL/MAJOR/MINOR findings; the FINAL line MUST be exactly 'VERDICT: READY' or 'VERDICT: NEEDS-FIXES'."
# Pin the reviewer to the absolute spec path so it cannot pick up a different
# BUILD_SPEC.md from its working directory.
prompt="$prompt"$'\n'"The spec file to review is exactly: $SPEC_FILE"$'\n'"Read that absolute path. Do NOT read BUILD_SPEC.md from the current working directory."
# The appended request is one $'…' (ANSI-C quoted) string: the \n escapes become
# newlines, and the literal line break before "SAFETY ASYMMETRY" is part of the string.
# Do not add comments or reflow text between the opening $' and the closing quote.
if [ -n "$FINDINGS_OUT" ]; then
    prompt="$prompt"$'\n\nADDITIONALLY, BEFORE the final VERDICT line, output a fenced ```json code block (the LAST json block in your reply) of the form: {"verdict":"READY|NEEDS-FIXES","findings":[{"severity":"CRITICAL|HIGH|MEDIUM|LOW","file":"path/relative/to/repo","surface":"the function/symbol/section the finding names","kind":"fixable|scope|design|needs-human"}]} — one entry per finding above. Set kind PRECISELY — the guard acts ONLY on this tag: kind=fixable is auto-converged by an agent (a deterministic edit); kind=design is auto-resolved by the CONDUCTOR — it grounds the live code, picks the option that fits existing patterns, records a REVERSIBLE decision, and re-gates (NOT a human stop); kind=scope and kind=needs-human STOP the loop and surface to the human. Use kind=fixable ONLY when you can name the DETERMINISTIC edit and NO judgment remains — e.g. grounding the real code path, following an existing spec/template/pattern, adding required template sections, correcting a false claim, or strengthening a gate to enforce an ALREADY-SPECIFIED invariant. Use kind=design for an architecture FORK (>=2 viable options, e.g. choosing a storage/API ownership model) that a code-grounded author CAN resolve — the conductor will, reversibly. Use kind=scope for a build-BOUNDARY call (whether work belongs in THIS build — a genuine human decision). Use kind=needs-human for anything needing info/authority/spend or a policy/threshold choice only the human has. DO NOT mark fixable if the finding requires ANY judgment (use design), and DO NOT mark design if the finding is a build-BOUNDARY/authority/spend/policy choice (use scope/needs-human — those are the hard human stops). 
SAFETY ASYMMETRY: a false design tag costs only one auto-resolve cycle, but a false fixable tag lets a sub-agent change something with NO recorded decision — so WHEN TORN between fixable and design, choose design; WHEN TORN about whether it is a genuine build-boundary/authority decision, choose scope/needs-human. surface must be a stable file:symbol-style label.'
fi
# ── Codex review with bounded retries ───────────────────────────────────────────
# Runs the review up to max_attempts times. A complete review is printed to stdout
# and decides the exit code (0 READY / 1 NEEDS-FIXES); a degenerate attempt (timeout,
# no verdict, header/prompt echo only) is retried with a growing backoff, and exit 2
# is returned once all attempts are exhausted.
max_attempts=3
attempt=1
# Portable HARD timeout for the codex call (macOS ships no timeout/gtimeout).
# perl's alarm() timer survives exec(), and SIGALRM's default action kills the
# exec'd codex — so a stalled review becomes a bounded, retryable degenerate
# attempt instead of an unbounded hang. A normal gpt-5.5 high-effort review
# returns in ~2-4min; 600s is a generous ceiling. Override via env.
# [hardened via REC-194 dogfood 2026-06-18: a self-gate codex exec slept 32min
#  with ZERO bytes of output, silently freezing the orchestrate run.]
# KNOWN LIMITATION (codex-flagged): SIGALRM kills the exec'd codex process, not its
# process group — descendants could orphan. The observed failure mode was codex ITSELF
# sleeping (no working children), so direct-kill addresses it; full subprocess-lifecycle
# ownership is REC-194's app-server broker (which the migration resolves structurally).
SELFGATE_TIMEOUT="${SPEC_SELFGATE_TIMEOUT:-600}"
# perl numifies the alarm argument loosely ("abc" becomes 0, which DISABLES the timer;
# "10m" becomes 10), so a malformed override would silently change or remove the hard
# timeout. Accept whole seconds only.
case "$SELFGATE_TIMEOUT" in
    ''|*[!0-9]*)
        echo "ERROR: SPEC_SELFGATE_TIMEOUT must be a whole number of seconds (got '$SELFGATE_TIMEOUT')" >&2
        exit 2 ;;
esac
while [ "$attempt" -le "$max_attempts" ]; do
    # stdin is /dev/null so codex can never block waiting for input; stderr is merged
    # so a failure message is visible in the captured output.
    review_output="$(perl -e 'alarm shift @ARGV; exec @ARGV' "$SELFGATE_TIMEOUT" \
        "$CODEX" exec \
        --skip-git-repo-check \
        -s read-only \
        -c model=gpt-5.5 \
        -c model_reasoning_effort=high \
        -C "$WORKING_DIR" \
        "$prompt" </dev/null 2>&1)"
    codex_rc=$?   # 142 = 128 + SIGALRM(14): the timeout fired

    # Last line that starts with VERDICT: (empty when there is none).
    verdict="$(printf '%s\n' "$review_output" | grep -E '^VERDICT:' | tail -n 1 || true)"
    byte_count="$(printf '%s' "$review_output" | wc -c | tr -d '[:space:]')"
    # Does the output contain any real review markers anywhere?
    # Fed through a here-string, NOT `printf | grep -q`: grep -q exits on the first
    # match, and with `pipefail` a SIGPIPE'd printf (large output) would fail the
    # pipeline and report "no" for an output that does contain markers.
    if grep -Eq '^[[:space:]]*(CRITICAL|MAJOR|MINOR|VERDICT):' <<<"$review_output"; then
        has_markers=yes
    else
        has_markers=no
    fi

    # Degenerate-output detection (codex exec intermittently aborts to header/prompt-echo
    # only). NOTE: prompt-echo string-matching was removed — the prompt is now multi-line
    # (--findings-out appends a JSON-format request), which broke `awk -v p="$prompt"`
    # (newline-in-string). The robust signal is small-output + no review markers, which
    # needs no reference to the (possibly multi-line) prompt. [hardened via REC-178 dogfood 2026-06-18]
    # Classify. A COMPLETE review (a VERDICT line is present) is honored EVEN IF the
    # process was SIGALRM-killed afterward: codex emits VERDICT as its LAST line, so a
    # post-verdict 142 is teardown, not a lost review. Only an output with NO usable
    # verdict is treated as a timeout/degenerate. [verdict-before-timeout reorder: the codex
    #  review of this hardening flagged that checking rc==142 FIRST discarded a valid verdict
    #  emitted before the hang — "no valid review lost". 2026-06-18]
    # A `^VERDICT:` line always satisfies the marker pattern too, so a non-empty verdict
    # already implies has_markers=yes; has_markers only refines the reason below.
    degenerate=false; reason=""
    if [ -n "$verdict" ]; then
        : # complete review — honor it regardless of codex_rc (incl. a post-verdict 142)
    elif [ "${codex_rc:-0}" -eq 142 ]; then
        degenerate=true; reason="codex exec timed out after ${SELFGATE_TIMEOUT}s (no complete verdict)"
    elif [ "${byte_count:-0}" -lt 4096 ] && [ "$has_markers" = "no" ]; then
        degenerate=true; reason="small output with no review markers (header/prompt echo only)"
    else
        degenerate=true; reason="no VERDICT line"
    fi

    if $degenerate; then
        if [ "$attempt" -ge "$max_attempts" ]; then
            # Out of attempts: no point announcing a retry or sleeping before exit.
            echo "[spec_selfgate] attempt $attempt/$max_attempts degenerate ($reason); giving up." >&2
            break
        fi
        echo "[spec_selfgate] attempt $attempt/$max_attempts degenerate ($reason); retrying…" >&2
        attempt=$((attempt + 1)); sleep $((attempt * 3)); continue   # backoff: 6s, then 9s
    fi

    printf '%s\n' "$review_output"

    # Extract the structured findings block for orchestrate_guard (fail-closed:
    # if no valid json block is present, emit a needs-human finding so the guard STOPs).
    # The LAST ```json block that parses to an object with a "findings" list wins.
    if [ -n "$FINDINGS_OUT" ]; then
        if ! printf '%s' "$review_output" | SPEC_FILE="$SPEC_FILE" FINDINGS_OUT="$FINDINGS_OUT" python3 -c '
import json, os, re, sys
raw = sys.stdin.read()
out = os.environ["FINDINGS_OUT"]; spec = os.environ["SPEC_FILE"]
blocks = re.findall(r"```json\s*(.*?)```", raw, re.DOTALL)
doc = None
for b in reversed(blocks):
    try:
        d = json.loads(b)
        if isinstance(d, dict) and isinstance(d.get("findings"), list):
            doc = d; break
    except Exception:
        continue
if doc is None:
    doc = {"verdict": "NEEDS-FIXES", "findings": [{"severity": "CRITICAL",
           "file": spec, "surface": "findings_extraction", "kind": "needs-human"}],
           "_note": "no valid findings json block in review; fail-closed"}
with open(out, "w", encoding="utf-8") as f:
    json.dump(doc, f, indent=2)
'; then
            # Stay fail-closed here too: if the findings file could not be written, a
            # stale file from an earlier run may still sit at that path, so do not let
            # the run end on a verdict exit code.
            echo "[spec_selfgate] findings extraction failed — $FINDINGS_OUT may be missing or stale (tooling error)." >&2
            exit 2
        fi
    fi

    # Same substring test as before, without a pipeline.
    case "$verdict" in
        *'VERDICT: READY'*)
            echo "[spec_selfgate] READY ✓ — safe to dispatch." >&2
            exit 0 ;;
    esac
    echo "[spec_selfgate] NEEDS-FIXES — address findings above and re-run before dispatch." >&2
    exit 1
done

echo "[spec_selfgate] codex review degenerate after $max_attempts attempts (last: ${reason:-unknown})" >&2
exit 2
