#!/usr/bin/env bash
# Behavioral checks for dispatch_reaper.sh.
#
# Each check below records its outcome through ok()/no(); the script's exit
# status is decided by the final FAIL comparison at the bottom of the file.
# Note: -e is not enabled here, so a failing command does not abort the run
# on its own; only unset variables (-u) and pipeline failures (pipefail) are
# tightened.
set -uo pipefail

# Absolute directory containing this script (resolved from $0).
HERE="$(cd "$(dirname "$0")" && pwd)"
# Script under test and the status helper, both expected one level above HERE.
REAPER="$HERE/../dispatch_reaper.sh"
STATUS="$HERE/../dispatch_status.py"
# Assertion tallies, incremented by ok() and no() respectively.
PASS=0
FAIL=0

# Report a passing assertion ($1 = description) and bump the PASS tally.
ok() {
  printf '  OK: %s\n' "$1"
  PASS=$((PASS + 1))
}
# Report a failing assertion ($1 = description) and bump the FAIL tally.
no() {
  printf '  FAIL: %s\n' "$1"
  FAIL=$((FAIL + 1))
}

# Bail out early when either the script under test or the status helper is
# missing; nothing below can run meaningfully without them.
if [[ ! -f "$REAPER" ]]; then
  echo "FATAL: reaper not found at $REAPER"
  exit 1
fi
if [[ ! -f "$STATUS" ]]; then
  echo "FATAL: dispatch_status.py not found at $STATUS"
  exit 1
fi

# Create the throwaway sandbox directory that holds every artifact of this run.
# The path is later removed recursively on exit, so refuse to continue unless
# mktemp really produced a directory: an empty SBX would make `cd "" && pwd -P`
# resolve to the current working directory and put it in line for deletion.
if ! SBX="$(mktemp -d)" || [ -z "$SBX" ] || [ ! -d "$SBX" ]; then
  echo "FATAL: could not create sandbox directory"
  exit 1
fi
# Canonicalize (resolve symlinks) so path comparisons against tool output match.
if ! SBX="$(cd "$SBX" && pwd -P)" || [ -z "$SBX" ] || [ ! -d "$SBX" ]; then
  echo "FATAL: could not resolve sandbox directory"
  exit 1
fi
# Registry of background PIDs started by the stubs; read back during cleanup.
PIDS_FILE="$SBX/pids"
touch "$PIDS_FILE" || { echo "FATAL: could not create $PIDS_FILE"; exit 1; }
# Tear down the sandbox: signal every PID recorded in PIDS_FILE, then delete
# the sandbox tree.
# Globals: PIDS_FILE (read), SBX (read)
# Lines that are empty or contain anything other than digits are skipped, so a
# corrupted registry can never turn into `kill` with an unexpected argument.
cleanup() {
  local pid  # keep the loop variable out of the global namespace
  if [ -f "${PIDS_FILE:-}" ]; then
    # `|| [ -n "$pid" ]` also handles a final line without trailing newline.
    while IFS= read -r pid || [ -n "$pid" ]; do
      case "$pid" in *[!0-9]*|"") continue ;; esac
      # Best effort: the process may already be gone.
      kill "$pid" >/dev/null 2>&1 || true
    done < "$PIDS_FILE"
  fi
  # Never run a recursive delete with an empty path.
  if [ -n "${SBX:-}" ]; then
    rm -rf -- "$SBX"
  fi
}
# Run cleanup on every exit path so background stub processes and the sandbox
# directory do not outlive the test run.
trap cleanup EXIT

# Sandbox layout: a private HOME, a directory for stub executables, log files
# written by the git stub, and the state directory used by the tmux stub.
HOME="$SBX/home"
BIN="$SBX/bin"
GIT_LOG="$SBX/git.log"
RESET_LOG="$SBX/reset.log"
TMUX_STATE="$SBX/tmux"

mkdir -p "$HOME/.recoil" "$BIN" "$TMUX_STATE"

# The stubs run as separate processes and read these from the environment.
export HOME GIT_LOG RESET_LOG TMUX_STATE PIDS_FILE

# git stub, installed as $BIN/git.
# - Accepts an optional leading `-C <worktree>` and appends every invocation
#   to $GIT_LOG.
# - `rev-parse --abbrev-ref` prints $EXPECTED_BRANCH (default
#   codex/REC-77-test).
# - `reset --hard <target>` appends "<worktree>|<target>" to $RESET_LOG.
# - Every other subcommand succeeds silently.
cat > "$BIN/git" <<'STUB'
#!/usr/bin/env bash
wt=""
if [ "${1:-}" = "-C" ]; then
  wt="$2"
  shift 2
fi
printf 'git -C %s %s\n' "$wt" "$*" >> "$GIT_LOG"
case "${1:-} ${2:-}" in
  "rev-parse --abbrev-ref")
    printf '%s\n' "${EXPECTED_BRANCH:-codex/REC-77-test}"
    exit 0
    ;;
  "reset --hard")
    printf '%s|%s\n' "$wt" "${3:-}" >> "$RESET_LOG"
    exit 0
    ;;
  *)
    exit 0
    ;;
esac
STUB
chmod +x "$BIN/git"

# tmux stub, installed as $BIN/tmux. Sessions are modelled as
# "$TMUX_STATE/<session>.pid" files holding the PID of a background process.
# - has-session -t <s>: succeeds only if the pid file exists and that PID
#   answers `kill -0`.
# - new-session [-d] -s <s> <cmd>: starts $REAPER_HARNESS (when set) or <cmd>
#   in the background, records the PID in the session's pid file and appends
#   it to $PIDS_FILE; exits 2 when the session name or command is missing.
# - display-message [-p] -t <s>: prints the recorded PID.
# - Any other subcommand exits 2.
cat > "$BIN/tmux" <<'STUB'
#!/usr/bin/env bash
state="${TMUX_STATE:?}"
case "${1:-}" in
  has-session)
    shift
    session=""
    while [ "$#" -gt 0 ]; do
      case "$1" in
        -t) session="$2"; shift 2 ;;
        *) shift ;;
      esac
    done
    [ -f "$state/$session.pid" ] || exit 1
    pid="$(cat "$state/$session.pid")"
    kill -0 "$pid" >/dev/null 2>&1
    ;;
  new-session)
    shift
    session=""
    cmd=""
    while [ "$#" -gt 0 ]; do
      case "$1" in
        -d) shift ;;
        -s) session="$2"; shift 2 ;;
        *) cmd="$1"; shift ;;
      esac
    done
    [ -n "$session" ] && [ -n "$cmd" ] || exit 2
    if [ -n "${REAPER_HARNESS:-}" ]; then
      nohup bash "$REAPER_HARNESS" "$session" >/dev/null 2>&1 &
    else
      bash -c "$cmd" >/dev/null 2>&1 &
    fi
    pid="$!"
    printf '%s\n' "$pid" > "$state/$session.pid"
    printf '%s\n' "$pid" >> "${PIDS_FILE:?}"
    exit 0
    ;;
  display-message)
    shift
    session=""
    while [ "$#" -gt 0 ]; do
      case "$1" in
        -p) shift ;;
        -t) session="$2"; shift 2 ;;
        *) shift ;;
      esac
    done
    cat "$state/$session.pid"
    ;;
  *)
    exit 2
    ;;
esac
STUB
chmod +x "$BIN/tmux"

# ps stub, installed as $BIN/ps. Understands only `-p <pid>` and `-o <field>`.
# - Exits 1 when no PID is given or the PID does not answer `kill -0`.
# - `-o lstart=` prints a fixed start time ("Mon Jan  1 00:00:00 2024").
# - `-o command=` prints "bash <harness> <session>" when the PID is recorded in
#   one of the tmux stub's pid files, otherwise "bash test-dispatch-reaper".
# - Any other field exits 1.
# Because $BIN is first on PATH, `ps` calls made by this test script itself
# also resolve to this stub.
cat > "$BIN/ps" <<'STUB'
#!/usr/bin/env bash
pid=""
field=""
while [ "$#" -gt 0 ]; do
  case "$1" in
    -p) pid="$2"; shift 2 ;;
    -o) field="$2"; shift 2 ;;
    *) shift ;;
  esac
done
[ -n "$pid" ] || exit 1
kill -0 "$pid" >/dev/null 2>&1 || exit 1
case "$field" in
  lstart=)
    printf '%s\n' "Mon Jan  1 00:00:00 2024"
    ;;
  command=)
    session=""
    if [ -d "${TMUX_STATE:-}" ]; then
      for f in "$TMUX_STATE"/*.pid; do
        [ -f "$f" ] || continue
        if [ "$(cat "$f")" = "$pid" ]; then
          session="$(basename "$f" .pid)"
          break
        fi
      done
    fi
    if [ -n "$session" ]; then
      printf 'bash %s %s\n' "${REAPER_HARNESS:-harness_orchestrator.sh}" "$session"
    else
      printf '%s\n' "bash test-dispatch-reaper"
    fi
    ;;
  *)
    exit 1
    ;;
esac
STUB
chmod +x "$BIN/ps"

# Stand-in orchestrator: loops forever in one-second sleeps, so it stays alive
# until something kills it. The tmux stub launches it for new sessions when
# REAPER_HARNESS points here.
cat > "$SBX/harness_orchestrator.sh" <<'STUB'
#!/usr/bin/env bash
while :; do sleep 1; done
STUB
chmod +x "$SBX/harness_orchestrator.sh"

# Put the stub directory first on PATH so git/tmux/ps resolve to the stubs,
# and publish the knobs the stubs read (REAPER_HARNESS, EXPECTED_BRANCH).
# WATCH_INTERVAL_S / DEAD_THRESHOLD_S are presumably consumed by the reaper
# itself -- not visible from this file; confirm against dispatch_reaper.sh.
PATH="$BIN:$PATH"
REAPER_HARNESS="$SBX/harness_orchestrator.sh"
WATCH_INTERVAL_S=1
DEAD_THRESHOLD_S=2
EXPECTED_BRANCH="codex/REC-77-test"
export PATH REAPER_HARNESS WATCH_INTERVAL_S DEAD_THRESHOLD_S EXPECTED_BRANCH

# session_workspace stub: appends its argument list to $SWLOG and exits with
# $SW_REAP_RC (default 0). Exported as SESSION_WORKSPACE so child processes
# can find it; the test cases below grep $SWLOG for "reap-one" / "reap" calls.
SWSTUB="$SBX/session_workspace.sh"
SWLOG="$SBX/sw.log"
cat > "$SWSTUB" <<'STUB'
#!/usr/bin/env bash
printf '%s\n' "$*" >> "$SWLOG"
# exit code controlled by SW_REAP_RC (default 0 = reaped ok)
exit "${SW_REAP_RC:-0}"
STUB
chmod +x "$SWSTUB"
export SESSION_WORKSPACE="$SWSTUB" SWLOG

# Notify stub: record every notify_run invocation so we can assert the
# worktree-kept recovery notification fires when a reap is unsafe.
# The stub appends its argument list to $NOTIFYLOG and always exits 0; it is
# exported as NOTIFY_TOOL so child processes can find it.
NOTIFYSTUB="$SBX/dispatch_notify.sh"
NOTIFYLOG="$SBX/notify.log"
cat > "$NOTIFYSTUB" <<'STUB'
#!/usr/bin/env bash
printf '%s\n' "$*" >> "$NOTIFYLOG"
exit 0
STUB
chmod +x "$NOTIFYSTUB"
export NOTIFY_TOOL="$NOTIFYSTUB" NOTIFYLOG

# Start a test case from a clean slate.
# Arguments: $1 - suffix for the per-case runs root ("$SBX/runs-<suffix>")
# Globals:   DISPATCH_RUNS_ROOT (set and exported); SBX, RESET_LOG, GIT_LOG,
#            TMUX_STATE (read)
# Effects:   creates the runs root, empties both git logs, and recreates the
#            tmux state directory so no session from an earlier case survives.
new_runs_root() {
  export DISPATCH_RUNS_ROOT="$SBX/runs-$1"
  mkdir -p "$DISPATCH_RUNS_ROOT"

  local log_file
  for log_file in "$RESET_LOG" "$GIT_LOG"; do
    : > "$log_file"
  done

  rm -rf "$TMUX_STATE"
  mkdir -p "$TMUX_STATE"
}

# Execute one pass of the reaper script with all of its output discarded.
# Returns the reaper's exit status.
run_reaper() {
  bash "$REAPER" &>/dev/null
}

# Create a fresh run via `dispatch_status.py init`.
# Arguments: $1 - run name (directory under DISPATCH_RUNS_ROOT)
#            $2 - worktree path (created if missing)
#            $3 - spec file path (overwritten with a placeholder)
# Globals:   DISPATCH_RUNS_ROOT, STATUS (read)
# Outputs:   the run directory path on stdout; init's own stdout is discarded.
init_run() {
  local run_name="$1" wt_path="$2" spec_path="$3"
  local target_dir="$DISPATCH_RUNS_ROOT/$run_name"
  local -a init_args=(
    init
    --run-dir "$target_dir"
    --issue REC-77
    --branch codex/REC-77-test
    --worktree "$wt_path"
    --spec "$spec_path"
    --last-validated-commit abc123
  )

  mkdir -p "$wt_path"
  printf '%s\n' '# spec' > "$spec_path"
  python3 "$STATUS" "${init_args[@]}" >/dev/null
  printf '%s\n' "$target_dir"
}

# Print one field of a run's status.json.
# Arguments: $1 - run directory
#            $2 - dotted key path (e.g. "budget.wall_clock_s_max")
# Outputs:   the value, or an empty line when the path does not resolve or the
#            value is null.
status_value() {
  python3 - "$1/status.json" "$2" <<'PY'
import json
import sys

status_path, dotted_key = sys.argv[1], sys.argv[2]
with open(status_path, "r", encoding="utf-8") as fh:
    node = json.load(fh)

# Walk the key path; stop as soon as we hit something that is not a mapping.
for key in dotted_key.split("."):
    if not isinstance(node, dict):
        node = None
        break
    node = node.get(key)

print(node if node is not None else "")
PY
}

# Mutate a run's status.json in place.
# Arguments: $1 - run directory
#            $2 - Python statements executed with the parsed document bound to
#                 the name `data`; in-place mutations are written back.
# The file is rewritten with 2-space indentation and sorted keys.
# NOTE: the snippet is passed to exec(); only use with trusted test input.
patch_status() {
  local target_dir="$1" snippet="$2"
  python3 - "$target_dir/status.json" "$snippet" <<'PY'
import json
import sys

status_path, snippet = sys.argv[1], sys.argv[2]
with open(status_path, "r", encoding="utf-8") as fh:
    doc = json.loads(fh.read())

exec(snippet, {"data": doc})

# Serialize fully before opening for write so a failure leaves the file intact.
rendered = json.dumps(doc, indent=2, sort_keys=True) + "\n"
with open(status_path, "w", encoding="utf-8") as fh:
    fh.write(rendered)
PY
}

# Write a terminal_status.json fixture.
# Arguments: $1 - output path (parent directories are created)
#            $2 - failure signature; empty string is stored as null
#            $3 - cause hint        (default: validation_failure)
#            $4 - converge status   (default: FAILED)
#            $5 - exit code         (default: 1)
# All other fields are fixed fixture values.
write_terminal() {
  local dest="$1" signature="$2"
  local cause_hint="${3:-validation_failure}"
  local verdict="${4:-FAILED}"
  local rc="${5:-1}"
  local parent
  parent="$(dirname "$dest")"
  mkdir -p "$parent"
  python3 - "$dest" "$signature" "$cause_hint" "$verdict" "$rc" <<'PY'
import json
import sys

_, dest, signature, cause_hint, verdict, rc = sys.argv

record = {
    "attempt": 1,
    "branch": "codex/REC-77-test",
    "cause_hint": cause_hint,
    "commit": "def456",
    "converge_status": verdict,
    "convergence_verdict_summary": "failed",
    "exit_code": int(rc),
    "failing_test_ids": ["test_reaper"],
    "failure_signature": signature or None,
    "gate": "validation",
    "log_path": "/tmp/build-log.md",
    "normalized_failure_excerpt": "",
    "phase": "phase-4",
    "pr_url": "https://github.example/pr/1",
    "run_id": "REC-77",
    "validation_command": "bash test_dispatch_reaper.sh",
    "written_at": "2026-06-06T00:00:00Z",
}

with open(dest, "w", encoding="utf-8") as out:
    json.dump(record, out, indent=2, sort_keys=True)
    out.write("\n")
PY
}

# Write a heartbeat.json fixture.
# Arguments: $1 - output path (parent directories are created)
#            $2 - run id
#            $3 - attempt number (integer)
#            $4 - orchestrator PID (integer)
#            $5 - orchestrator start time string
#            $6 - tmux session name
#            $7 - updated_at timestamp
#            $8 - ps_match string (default: harness_orchestrator.sh)
write_heartbeat() {
  local dest="$1" run="$2" attempt_no="$3" orch_pid="$4"
  local started="$5" tmux_name="$6" stamp="$7"
  local matcher="${8:-harness_orchestrator.sh}"
  local parent
  parent="$(dirname "$dest")"
  mkdir -p "$parent"
  python3 - "$dest" "$run" "$attempt_no" "$orch_pid" "$started" "$tmux_name" "$stamp" "$matcher" <<'PY'
import json
import sys

_, dest, run, attempt_no, orch_pid, started, tmux_name, stamp, matcher = sys.argv

beat = {
    "attempt": int(attempt_no),
    "log_path": "/tmp/build-log.md",
    "orchestrator_pid": int(orch_pid),
    "orchestrator_start_time": started,
    "phase_hint": None,
    "ps_match": matcher,
    "round_hint": None,
    "run_id": run,
    "tmux_session": tmux_name,
    "updated_at": stamp,
}

with open(dest, "w", encoding="utf-8") as out:
    json.dump(beat, out, indent=2, sort_keys=True)
    out.write("\n")
PY
}

# Print a UTC timestamp 60 seconds in the past, second precision, in the form
# YYYY-MM-DDTHH:MM:SSZ.
old_iso() {
  python3 - <<'PY'
from datetime import datetime, timedelta, timezone

stamp = datetime.now(timezone.utc) - timedelta(seconds=60)
print(stamp.strftime("%Y-%m-%dT%H:%M:%SZ"))
PY
}

# Print the current UTC time, second precision, in the form
# YYYY-MM-DDTHH:MM:SSZ.
fresh_iso() {
  python3 - <<'PY'
from datetime import datetime, timezone

print(datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"))
PY
}

# Print the modification time of file $1 as a float (seconds since the epoch).
# Fails with a non-zero status and no stdout when the file cannot be stat'ed.
mtime() {
  python3 - "$1" <<'PY'
import os
import sys

print(os.stat(sys.argv[1]).st_mtime)
PY
}

# terminal TRANSIENT under cap -> re-dispatch and reset to last_validated_commit
# Setup: a running attempt 1 that has written a failing terminal status with a
# signature not seen before.
new_runs_root transient
RUN="$(init_run REC-77-transient "$SBX/wt-transient" "$SBX/spec-transient.md")"
python3 "$STATUS" transition --run-dir "$RUN" --state ATTEMPT_RUNNING >/dev/null
write_terminal "$RUN/attempt-001/terminal_status.json" "sha256:transient"
run_reaper
if [[ "$(status_value "$RUN" state)" == "ATTEMPT_RUNNING" && "$(status_value "$RUN" attempt)" == "2" ]]; then
  ok "terminal TRANSIENT re-dispatches"
else
  no "terminal TRANSIENT did not re-dispatch"
fi
# abc123 is the last-validated commit that init_run registers for every run.
if grep -q "abc123" "$RESET_LOG"; then
  ok "re-dispatch reset --hard uses last_validated_commit"
else
  no "re-dispatch did not reset to last_validated_commit"
fi

# repeated signature -> deterministic cap with no re-dispatch
# Setup: the terminal status carries a signature already listed in
# prior_failure_signatures.
new_runs_root deterministic
RUN="$(init_run REC-77-deterministic "$SBX/wt-deterministic" "$SBX/spec-deterministic.md")"
python3 "$STATUS" transition --run-dir "$RUN" --state ATTEMPT_RUNNING >/dev/null
patch_status "$RUN" 'data["prior_failure_signatures"]=["sha256:repeat"]'
write_terminal "$RUN/attempt-001/terminal_status.json" "sha256:repeat"
run_reaper
if [[ "$(status_value "$RUN" state)" == "CAPPED_NEEDS_HUMAN" ]]; then
  ok "repeated signature caps as deterministic"
else
  no "repeated signature did not cap"
fi
# An empty reset log means the git stub never saw `reset --hard`.
if [[ ! -s "$RESET_LOG" ]]; then
  ok "deterministic failure does not re-dispatch"
else
  no "deterministic failure re-dispatched"
fi

# budget exceeded -> cap
# Setup: a one-second wall-clock budget on a run created years ago.
new_runs_root budget
RUN="$(init_run REC-77-budget "$SBX/wt-budget" "$SBX/spec-budget.md")"
python3 "$STATUS" transition --run-dir "$RUN" --state ATTEMPT_RUNNING >/dev/null
patch_status "$RUN" 'data["budget"]["wall_clock_s_max"]=1; data["created_at"]="2020-01-01T00:00:00Z"'
run_reaper
if [[ "$(status_value "$RUN" state)" == "CAPPED_NEEDS_HUMAN" ]]; then
  ok "budget exceeded caps"
else
  no "budget exceeded did not cap"
fi

# fresh STARTED inside grace is not zombie-probed
# Setup: a freshly initialised run with no further transitions.
new_runs_root started_grace
RUN="$(init_run REC-77-started "$SBX/wt-started" "$SBX/spec-started.md")"
run_reaper
if [[ "$(status_value "$RUN" state)" == "STARTED" && "$(status_value "$RUN" attempt)" == "1" ]]; then
  ok "fresh STARTED within grace is left alone"
else
  no "fresh STARTED was zombie-probed"
fi

# stale heartbeat + dead PID -> ZOMBIE_SUSPECT, then ZOMBIE_REAPED retry
# Setup: a 60-second-old heartbeat naming PID 999999 and a session the tmux
# stub has no record of.
new_runs_root zombie
RUN="$(init_run REC-77-zombie "$SBX/wt-zombie" "$SBX/spec-zombie.md")"
python3 "$STATUS" transition --run-dir "$RUN" --state ATTEMPT_RUNNING >/dev/null
write_heartbeat "$RUN/attempt-001/heartbeat.json" "REC-77-zombie" 1 999999 "Mon Jan  1 00:00:00 2024" "dead-session" "$(old_iso)"
# First pass: suspicion only.
run_reaper
if [[ "$(status_value "$RUN" state)" == "ZOMBIE_SUSPECT" ]]; then
  ok "stale dead heartbeat becomes ZOMBIE_SUSPECT"
else
  no "stale dead heartbeat did not become ZOMBIE_SUSPECT"
fi
# Second pass: still dead, so the attempt is reaped and retried.
run_reaper
if [[ "$(status_value "$RUN" state)" == "ATTEMPT_RUNNING" && "$(status_value "$RUN" attempt)" == "2" ]]; then
  ok "ZOMBIE_SUSPECT still dead is reaped and retried"
else
  no "ZOMBIE_SUSPECT was not retried"
fi

# fresh heartbeat + confirmed-dead orchestrator -> prompt zombie suspicion and retry
# Setup: the heartbeat timestamp is current, but it names PID 999999 and a
# session the tmux stub has no record of.
new_runs_root fresh_confirmed_dead
RUN="$(init_run REC-77-fresh-dead "$SBX/wt-fresh-dead" "$SBX/spec-fresh-dead.md")"
python3 "$STATUS" transition --run-dir "$RUN" --state ATTEMPT_RUNNING >/dev/null
write_heartbeat "$RUN/attempt-001/heartbeat.json" "REC-77-fresh-dead" 1 999999 "Mon Jan  1 00:00:00 2024" "gone-session" "$(fresh_iso)"
run_reaper
# Remember the state after the first pass; the second pass overwrites it.
FIRST_STATE="$(status_value "$RUN" state)"
run_reaper
if [[ "$FIRST_STATE" == "ZOMBIE_SUSPECT" && "$(status_value "$RUN" state)" == "ATTEMPT_RUNNING" && "$(status_value "$RUN" attempt)" == "2" ]]; then
  ok "fresh confirmed-dead heartbeat is promptly suspected and retried"
else
  no "fresh confirmed-dead heartbeat was not promptly retried"
fi

# reused PID with different run identity is not false-alive
# Setup: the heartbeat points at this test script's own PID ($$), which is
# certainly alive, registered under a hand-made session name.
new_runs_root reused_pid
RUN="$(init_run REC-77-reused "$SBX/wt-reused" "$SBX/spec-reused.md")"
python3 "$STATUS" transition --run-dir "$RUN" --state ATTEMPT_RUNNING >/dev/null
SESSION="manual-reused"
printf '%s\n' "$$" > "$TMUX_STATE/$SESSION.pid"
# `ps` resolves to the stub here; surrounding spaces are trimmed.
START="$(ps -p "$$" -o lstart= 2>/dev/null | sed 's/^ *//;s/ *$//')"
write_heartbeat "$RUN/attempt-001/heartbeat.json" "REC-77-reused" 1 "$$" "$START" "$SESSION" "$(old_iso)" "bash"
run_reaper
if [[ "$(status_value "$RUN" state)" == "ZOMBIE_SUSPECT" ]]; then
  ok "reused PID with mismatched run identity is not alive"
else
  no "reused PID was treated as alive"
fi

# crash mid re-dispatch: RETRY_PENDING resumes launch without another increment
# Setup: retry-start has been recorded but no launch followed.
new_runs_root retry_pending
RUN="$(init_run REC-77-pending "$SBX/wt-pending" "$SBX/spec-pending.md")"
python3 "$STATUS" retry-start --run-dir "$RUN" --failure-signature sha256:first >/dev/null
run_reaper
if [[ "$(status_value "$RUN" state)" == "ATTEMPT_RUNNING" && "$(status_value "$RUN" attempt)" == "2" ]]; then
  ok "RETRY_PENDING resumes launch idempotently"
else
  no "RETRY_PENDING did not resume correctly"
fi

# crash before ATTEMPT_RUNNING: stale STARTED past grace is startup-zombie retryable
# Setup: push both the grace deadline and updated_at far into the past.
new_runs_root startup_zombie
RUN="$(init_run REC-77-startup "$SBX/wt-startup" "$SBX/spec-startup.md")"
patch_status "$RUN" 'data["started_grace_until"]="2020-01-01T00:00:00Z"; data["updated_at"]="2020-01-01T00:00:00Z"'
run_reaper
if [[ "$(status_value "$RUN" state)" == "ATTEMPT_RUNNING" && "$(status_value "$RUN" attempt)" == "2" ]]; then
  ok "stale STARTED past grace is retried as startup zombie"
else
  no "stale STARTED was not retried"
fi

# watcher dies with killed orchestrator and stops refreshing heartbeat
# Setup: a pending retry makes the reaper launch attempt 2 through the tmux
# stub, which records the launched PID under the session name used below.
new_runs_root watcher
RUN="$(init_run REC-77-watcher "$SBX/wt-watcher" "$SBX/spec-watcher.md")"
python3 "$STATUS" retry-start --run-dir "$RUN" --failure-signature sha256:watcher >/dev/null
run_reaper
HB="$RUN/attempt-002/heartbeat.json"
SESSION="REC-77-watcher-attempt-002"
# Empty when the session was never launched; checked explicitly below.
PID=""
if [ -f "$TMUX_STATE/$SESSION.pid" ]; then
  PID="$(cat "$TMUX_STATE/$SESSION.pid")"
fi
sleep 2
# SIGKILL on purpose: this simulates an orchestrator crash, not a shutdown.
if [ -n "$PID" ]; then
  kill -9 "$PID" >/dev/null 2>&1 || true
fi
sleep 1
MTIME_AFTER_KILL="$(mtime "$HB")"
sleep 2
MTIME_LATER="$(mtime "$HB")"
# Guard against a vacuous pass: if the session or the heartbeat file was never
# created, both mtime reads come back empty and would compare equal.
if [ -z "$PID" ] || [ -z "$MTIME_AFTER_KILL" ] || [ -z "$MTIME_LATER" ]; then
  no "heartbeat watcher check could not run (session pid or heartbeat file missing)"
elif [ "$MTIME_AFTER_KILL" = "$MTIME_LATER" ]; then
  ok "heartbeat watcher stops touching after orchestrator kill"
else
  no "heartbeat watcher kept touching after orchestrator kill"
fi
# Force the run and its heartbeat to look long-stale so the next pass has to
# evaluate the killed orchestrator.
patch_status "$RUN" 'data["state"]="ATTEMPT_RUNNING"; data["attempt"]=2'
patch_status "$RUN" 'data["updated_at"]="2020-01-01T00:00:00Z"'
python3 - "$HB" <<'PY'
import json
import sys
with open(sys.argv[1], "r", encoding="utf-8") as handle:
    data = json.load(handle)
data["updated_at"] = "2020-01-01T00:00:00Z"
with open(sys.argv[1], "w", encoding="utf-8") as handle:
    json.dump(data, handle, indent=2, sort_keys=True)
    handle.write("\n")
PY
run_reaper
if [ "$(status_value "$RUN" state)" = "ZOMBIE_SUSPECT" ]; then
  ok "killed orchestrator becomes detectable zombie"
else
  no "killed orchestrator was not detected as zombie"
fi

# terminal converged run calls reap-one --worktree on the run's worktree
# Setup: terminal status with CONVERGED / exit code 0 and no failure signature.
new_runs_root reap_converged
: > "$SWLOG"
RUN="$(init_run REC-77-rc "$SBX/wt-rc" "$SBX/spec-rc.md")"
python3 "$STATUS" transition --run-dir "$RUN" --state ATTEMPT_RUNNING >/dev/null
write_terminal "$RUN/attempt-001/terminal_status.json" "" validation_failure CONVERGED 0
run_reaper
if grep -q "reap-one --worktree $SBX/wt-rc" "$SWLOG"; then
  ok "converged terminal calls reap-one on worktree"
else
  no "converged did not call reap-one"
fi

# capped (deterministic) run also calls reap-one
# Setup: repeated failure signature, as in the deterministic-cap case.
new_runs_root reap_capped
: > "$SWLOG"
RUN="$(init_run REC-77-cap "$SBX/wt-cap" "$SBX/spec-cap.md")"
python3 "$STATUS" transition --run-dir "$RUN" --state ATTEMPT_RUNNING >/dev/null
patch_status "$RUN" 'data["prior_failure_signatures"]=["sha256:cap"]'
write_terminal "$RUN/attempt-001/terminal_status.json" "sha256:cap"
run_reaper
if grep -q "reap-one --worktree $SBX/wt-cap" "$SWLOG"; then
  ok "capped terminal calls reap-one on worktree"
else
  no "capped did not call reap-one"
fi

# capped dirty/unpushed: reap-one returns nonzero -> reaper keeps + notifies (no state change to non-terminal)
# Setup: same repeated-signature cap, but the session_workspace stub is told
# to exit 1 for this one reaper pass.
new_runs_root reap_capped_dirty
: > "$SWLOG"
: > "$NOTIFYLOG"
RUN="$(init_run REC-77-capd "$SBX/wt-capd" "$SBX/spec-capd.md")"
python3 "$STATUS" transition --run-dir "$RUN" --state ATTEMPT_RUNNING >/dev/null
patch_status "$RUN" 'data["prior_failure_signatures"]=["sha256:capd"]'
write_terminal "$RUN/attempt-001/terminal_status.json" "sha256:capd"
SW_REAP_RC=1 run_reaper
if [[ "$(status_value "$RUN" state)" == "CAPPED_NEEDS_HUMAN" ]]; then
  ok "unsafe reap keeps terminal state CAPPED_NEEDS_HUMAN"
else
  no "unsafe reap changed terminal state"
fi
# the recovery notification MUST fire when reap is unsafe (worktree-kept), else
# a kept worktree is silently dropped from the operator's view
if grep -q "worktree-kept" "$NOTIFYLOG"; then
  ok "unsafe reap fires worktree-kept recovery notification"
else
  no "unsafe reap dropped recovery notification"
fi

# transient retry state must NOT call reap-one
# Setup: a first-time failure signature, i.e. the re-dispatch path.
new_runs_root reap_transient
: > "$SWLOG"
RUN="$(init_run REC-77-trans "$SBX/wt-trans" "$SBX/spec-trans.md")"
python3 "$STATUS" transition --run-dir "$RUN" --state ATTEMPT_RUNNING >/dev/null
write_terminal "$RUN/attempt-001/terminal_status.json" "sha256:transient2"
run_reaper
if grep -q "reap-one" "$SWLOG"; then
  no "transient path called reap-one"
else
  ok "transient retry does NOT call reap-one"
fi

# missing/non-executable SESSION_WORKSPACE must be observable, not silently success-shaped
# Setup: a converged run, with SESSION_WORKSPACE pointed at a path that does
# not exist for the duration of one reaper pass.
new_runs_root reap_missing_sw
: > "$SWLOG"
: > "$NOTIFYLOG"
RUN="$(init_run REC-77-nosw "$SBX/wt-nosw" "$SBX/spec-nosw.md")"
mkdir -p "$SBX/wt-nosw"
python3 "$STATUS" transition --run-dir "$RUN" --state ATTEMPT_RUNNING >/dev/null
write_terminal "$RUN/attempt-001/terminal_status.json" "" validation_failure CONVERGED 0
# Save and restore explicitly so later cases always see the real stub again.
OLD_SESSION_WORKSPACE="$SESSION_WORKSPACE"
SESSION_WORKSPACE="$SBX/missing-session-workspace.sh" run_reaper
SESSION_WORKSPACE="$OLD_SESSION_WORKSPACE"
if ! grep -q "reap-one" "$SWLOG" && grep -q "worktree-reap-skipped" "$NOTIFYLOG"; then
  ok "missing SESSION_WORKSPACE logs/notifies reap skip"
else
  no "missing SESSION_WORKSPACE was silent"
fi

# periodic sweep calls `reap` once per main() pass
# Setup: an empty runs root, so the only expected stub call is the sweep.
new_runs_root reap_sweep
: > "$SWLOG"
run_reaper
if [ "$(grep -c '^reap --ttl-hours' "$SWLOG")" -eq 1 ]; then
  ok "periodic sweep calls reap once per pass"
else
  no "periodic sweep call count wrong"
fi

# Print the tally; the final arithmetic test is the script's last command and
# therefore its exit status: success only when nothing failed.
printf '%s\n' "--------"
printf 'PASS=%s  FAIL=%s\n' "$PASS" "$FAIL"
(( FAIL == 0 ))
