#!/usr/bin/env python3
"""consult_artifact_lint.py — Citation/path linter for consult artifacts.

A deterministic meta-gate for ungateable design artifacts (specs, consult
syntheses). The Claude x Codex pairing SYNTHESIS (decision b#11, build-order #3)
calls for a citation/path linter that runs BEFORE any model reviews a design
artifact, so neither engine wastes a turn on fabricated or stale file
citations.

v1 scope (THIS module): **path existence only**. Extract candidate file-path
citations from a markdown artifact and verify each one resolves on disk
relative to the repo root (or as an absolute path). Classify REAL vs MISSING.
Exit 0 if all cited paths resolve, exit 1 if any are MISSING — so it can gate.

## Not implemented (NOT v1 — deliberately out of scope; see SYNTHESIS b#11)
The SYNTHESIS describes a fuller ConsultArtifact gate. These are TODO, NOT here:
  - config-value grep-verification (every cited config value is grep-findable)
  - recommendation -> brief-question mapping (every recommendation maps to a
    question in the originating brief)
  - open_uncertainties presence (non-empty where claims could not be proven)
Path existence is the cheap, high-value half; the rest is left for a later
slice and intentionally not built here.

Usage:
    python3 consult_artifact_lint.py SYNTHESIS.md [more.md ...]
    python3 consult_artifact_lint.py --repo-root /path/to/repo doc.md
    python3 consult_artifact_lint.py --json doc.md
"""

import argparse
import json
import re
import sys
from pathlib import Path

# Repo root resolution mirrors consult.py's REPO_ROOT_LOCAL pattern:
# tools/ -> pipeline/ -> recoil/ -> repo root.
# __file__ is resolved to an absolute path first: when it is relative (e.g. a
# bare "consult_artifact_lint.py"), Path(...).parent collapses to "." and every
# further .parent stays ".", which would silently make the derived repo root
# the current working directory.
PIPELINE_ROOT_LOCAL = Path(__file__).resolve().parent.parent
RECOIL_ROOT_LOCAL = PIPELINE_ROOT_LOCAL.parent
REPO_ROOT_LOCAL = RECOIL_ROOT_LOCAL.parent

# Building blocks for the path regexes below.
# _PATH_CHARS is the character class for ONE character of a path token:
# letters/digits/_/-/. plus the "/" separator. The at-least-one-slash rule (so
# bare words like "medium" or "high" are never candidates) is enforced by the
# consumers (_BARE_PATH_RE, _looks_like_path_token), not by this class.
# _LINE_SUFFIX is an optional trailing ":NNN" or ":NNN-MMM" line/range suffix;
# when present it is captured as part of the token and stripped before
# existence-checking.
_PATH_CHARS = r"[A-Za-z0-9_./\-]"
_LINE_SUFFIX = r"(?::\d+(?:-\d+)?)?"

# Inline-code spans: `...` (non-empty, on a single line). The regex captures
# every such span; extract_candidates() then trims the content and keeps it
# only if _looks_like_path_token() accepts it as a single path token (no
# spaces inside).
_INLINE_CODE_RE = re.compile(r"`([^`\n]+)`")

# Bare paths in prose: a conservative path token bounded on both sides by the
# line start/end or by any character outside the path character class
# (whitespace, most punctuation). The token must contain at least one path
# character, then a "/", and somewhere after that a "." followed by at least
# one more path character (i.e. something extension-like). Directory-only bare
# tokens are too ambiguous in prose, so an extension is required here;
# inline-code spans are looser.
_BARE_PATH_RE = re.compile(
    r"(?<![A-Za-z0-9_./\-])"  # left boundary: not mid-token
    rf"({_PATH_CHARS}+/{_PATH_CHARS}*\.{_PATH_CHARS}+{_LINE_SUFFIX})"
    r"(?![A-Za-z0-9_./\-])"  # right boundary
)

# Things that look like paths but are not repo file citations.
# _URL_RE is anchored at the start of a token: _looks_like_path_token() uses it
# to reject tokens that begin with a "scheme://" prefix.
_URL_RE = re.compile(r"^[a-z][a-z0-9+.\-]*://", re.IGNORECASE)
# A full URL anywhere in a line (incl. inside backticks). Stripped before any
# path scanning so the bare-path regex can't latch onto a scheme-less remnant
# of the authority/path (e.g. "example.com/path/to/thing.html"). The run is
# bounded by whitespace OR a backtick so a stripped inline span doesn't consume
# its own closing backtick and desync later inline-code spans on the same line.
_URL_ANYWHERE_RE = re.compile(r"[a-z][a-z0-9+.\-]*://[^\s`]+", re.IGNORECASE)
# Templated / env-var path tokens anywhere in a line: "$VAR/...", "${...}/...",
# "~/...". Stripped before path scanning so the bare-path regex can't latch
# onto the remainder left after the variable marker (e.g.
# "$RECOIL_ROOT/_logs/x.json" leaking "RECOIL_ROOT/_logs/x.json"). Mirrors the
# URL pre-strip: the run is likewise bounded by whitespace or a backtick, so it
# never eats the closing backtick of an inline-code span.
_TEMPLATE_PATH_RE = re.compile(r"(?:\$\{[^}]*\}|\$[A-Za-z_][A-Za-z0-9_]*|~)[^\s`]*")
# Git-diff style prefixes ("a/...", "b/...") are deliberately not special-cased
# here: such tokens go through the normal existence check like any other
# candidate, and classify() drops them as SKIP when their first segment is not
# a real directory. The conservative filters in _looks_like_path_token() catch
# the genuine non-paths.

# Strip a trailing ':NNN' or ':NNN-MMM' line/range suffix before existence-check.
# Anchored with "$", so a ':NNN' in the middle of a token is left alone.
_LINE_SUFFIX_STRIP_RE = re.compile(r":\d+(?:-\d+)?$")


def _looks_like_path_token(token: str) -> bool:
    """True if `token` is plausibly a repo file/dir citation, not prose/URL.

    Conservative: bias toward NOT flagging (avoid false MISSING). A token
    qualifies only if it contains a "/", has no spaces, is not a URL, and is
    not an obvious placeholder.
    """
    if not token:
        return False
    if " " in token or "\t" in token:
        return False
    if "/" not in token:
        return False
    if _URL_RE.match(token):
        return False
    # Templated / env-var paths (`$RECOIL_ROOT/...`, `${VAR:-x}/...`,
    # `projects/{project}/...`, `~/Dropbox/...`) cannot resolve as literal
    # paths — checking them yields guaranteed false MISSING. Drop them.
    if any(c in token for c in ("$", "{", "}", "~")):
        return False
    # Bare scheme-less domains / emails / option strings sneak past the URL
    # check; reject tokens containing characters we never expect in repo paths.
    if any(c in token for c in ("@", "?", "#", "*", "(", ")", "[", "]")):
        return False
    # A bare hostname signature: the first slash-delimited segment contains a
    # "." while NOT being a leading-dot relative/dotfile prefix (".", "..",
    # ".github", etc.). Hostnames like "arxiv.org/..." are rejected; real
    # leading-dot paths (`../foo/bar.md`, `.github/workflows/ci.yml`) are kept.
    first_segment = token.split("/", 1)[0]
    if "." in first_segment and not first_segment.startswith("."):
        return False
    # CLI-flag pairs like "-c/--config" or "-p/--profile" contain a "/" but are
    # not paths. A repo path segment never starts with "-", and these have no
    # extension; reject any token with a leading "-" on either side of a "/".
    if any(seg.startswith("-") for seg in token.split("/")):
        return False
    # Require a path-shape signature so notation like "max/ultracode" (a slashed
    # word pair with no extension and no explicit dir marker) is not mistaken
    # for a file. Accept it only if it has a "." (file extension or dotfile) or
    # an explicit trailing "/" (directory). Conservative: drops ambiguous
    # slashed words rather than risk a false MISSING on a non-path.
    if "." not in token and not token.endswith("/"):
        return False
    return True


def _strip_line_suffix(token: str) -> str:
    """Return `token` without its trailing ':NNN' or ':NNN-MMM' line suffix.

    A token that carries no such suffix is returned unchanged.
    """
    hit = _LINE_SUFFIX_STRIP_RE.search(token)
    if hit is None:
        return token
    # Splice the matched suffix out, keeping whatever surrounds it.
    return token[: hit.start()] + token[hit.end() :]


def extract_candidates(text):
    """Collect candidate path citations from markdown `text`.

    Every line is scanned twice: once for inline-code spans and once for bare
    path tokens in prose. Returns a list of (token, line_number) tuples with
    1-based line numbers, in first-seen order, de-duplicated per line on the
    token with its line suffix removed.
    """
    found = []
    seen_keys = set()

    for lineno, raw_line in enumerate(text.splitlines(), start=1):
        # Blank out full URLs, then templated/env-var paths, so neither scan
        # below can pick up a scheme-less or var-stripped leftover.
        scrubbed = _TEMPLATE_PATH_RE.sub(" ", _URL_ANYWHERE_RE.sub(" ", raw_line))

        # (a) inline-code spans, trimmed; (b) bare prose paths (which require
        # an extension). Inline tokens are considered first.
        inline_tokens = (
            m.group(1).strip() for m in _INLINE_CODE_RE.finditer(scrubbed)
        )
        bare_tokens = (m.group(1) for m in _BARE_PATH_RE.finditer(scrubbed))

        for raw_token in (*inline_tokens, *bare_tokens):
            # Drop prose punctuation hugging the end of a citation. A ":NNN" /
            # ":NNN-MMM" suffix ends in a digit, so it survives this strip.
            token = raw_token.rstrip(".,;:)]}\"'")
            if not _looks_like_path_token(token):
                continue
            # Key on the suffix-stripped path plus the line, so an inline span
            # and the bare-regex match of the same path collapse into one.
            key = (_strip_line_suffix(token), lineno)
            if key not in seen_keys:
                seen_keys.add(key)
                found.append((token, lineno))

    return found


def classify(token, repo_root, artifact_dir=None):
    """Check whether the path cited by `token` exists.

    Any ':NNN' / ':NNN-MMM' suffix is removed first. Returns a
    (status, resolved_path_str) tuple where status is:

      - "REAL":    the path exists, either as an absolute path or relative to
                   `repo_root` or to `artifact_dir` (tried in that order).
      - "MISSING": the path does not exist, yet it visibly anchors into a
                   known location, so it is a stale or fabricated citation
                   worth flagging. For a relative path this means its FIRST
                   segment is a real directory under one of the search roots
                   (a made-up deeper directory such as
                   `recoil/not_a_real_dir/x.py` still counts); for an absolute
                   path it means the immediate parent directory exists.
      - "SKIP":    the path does not exist and nothing anchors it, so it cannot
                   be told apart from non-path notation (`getattr/dict.get`, a
                   diff's `a/recoil/...` prefix). Callers drop these to avoid
                   false MISSING reports.

    `artifact_dir` is searched as well because consult folders routinely cite
    sibling files (e.g. `research/foo.md`) relative to the artifact itself
    rather than to the repo root. For MISSING and SKIP on a relative path the
    returned string is always the repo-root-relative resolution.
    """
    bare = _strip_line_suffix(token)
    cited = Path(bare)

    if cited.is_absolute():
        if cited.exists():
            verdict = "REAL"
        elif cited.parent.exists():
            # Only an existing immediate parent counts as an anchor; the
            # filesystem root always exists and would over-flag.
            verdict = "MISSING"
        else:
            verdict = "SKIP"
        return verdict, str(cited)

    search_roots = [Path(repo_root)]
    if artifact_dir is not None:
        search_roots.append(Path(artifact_dir))

    # First root under which the full path exists wins.
    for root in search_roots:
        hit = root / bare
        if hit.exists():
            return "REAL", str(hit)

    # Not found anywhere: flag it only if the leading segment names a real
    # directory under some root; otherwise it is not confidently a path.
    head = bare.partition("/")[0]
    anchored = any((root / head).is_dir() for root in search_roots)
    return ("MISSING" if anchored else "SKIP"), str(Path(repo_root) / bare)


def lint_file(md_path, repo_root):
    """Lint a single markdown artifact and summarise its path citations.

    The file is read as UTF-8. Returns a dict with the keys "file", "total",
    "real", "missing" (counts), "citations" (every REAL or MISSING record) and
    "missing_citations" (the MISSING subset). Each record holds "token",
    "line", "status" and "resolved". SKIP results are left out entirely.
    """
    artifact = Path(md_path)
    body = artifact.read_text(encoding="utf-8")
    found = extract_candidates(body)
    # Citations may also be relative to the artifact's own folder.
    base_dir = artifact.resolve().parent

    records = []
    for token, lineno in found:
        status, resolved = classify(token, repo_root, artifact_dir=base_dir)
        if status != "SKIP":  # SKIP = not confidently a path citation
            records.append(
                {
                    "token": token,
                    "line": lineno,
                    "status": status,
                    "resolved": resolved,
                }
            )

    stale = [rec for rec in records if rec["status"] == "MISSING"]

    return {
        "file": str(artifact),
        "total": len(records),
        "real": len(records) - len(stale),
        "missing": len(stale),
        "citations": records,
        "missing_citations": stale,
    }


def lint(paths, repo_root):
    """Lint each file in `paths`, in order.

    Returns (results, any_missing): the per-file result dicts and a bool that
    is True when at least one file reported a MISSING citation.
    """
    results = []
    any_missing = False
    for path in paths:
        outcome = lint_file(path, repo_root)
        results.append(outcome)
        if outcome["missing"]:
            any_missing = True
    return results, any_missing


def _print_human(results):
    for r in results:
        print(
            f"\n{r['file']}: {r['total']} path citations, "
            f"{r['real']} REAL, {r['missing']} MISSING"
        )
        for rec in r["missing_citations"]:
            print(f"  MISSING  line {rec['line']}: {rec['token']}")
    total = sum(r["total"] for r in results)
    missing = sum(r["missing"] for r in results)
    print(
        f"\nTally: {total} cited paths, {missing} MISSING across {len(results)} file(s)"
    )


def main(argv=None):
    """Run the linter CLI and return the process exit status.

    `argv` is the argument list to parse; None lets argparse read the process
    arguments. Returns 1 if any linted file has a MISSING citation, else 0.

    Bad usage makes argparse exit with status 2 (SystemExit). That includes a
    --repo-root that is not an existing directory.
    """
    parser = argparse.ArgumentParser(
        description="Lint path citations in consult artifacts (v1: existence only).",
    )
    parser.add_argument("files", nargs="+", help="Markdown artifact(s) to lint.")
    parser.add_argument(
        "--repo-root",
        default=str(REPO_ROOT_LOCAL),
        help="Repo root to resolve relative citations against "
        "(default: derived repo root).",
    )
    parser.add_argument("--json", action="store_true", help="Emit JSON results.")
    args = parser.parse_args(argv)

    repo_root = Path(args.repo_root)
    # Fail closed on a bad root. Against a non-existent root, repo-relative
    # citations can neither resolve nor anchor, so classify() reports them as
    # SKIP (unless they happen to anchor on the artifact's own directory) and
    # the gate could exit 0 without having checked them.
    if not repo_root.is_dir():
        parser.error(f"--repo-root is not a directory: {repo_root}")

    results, any_missing = lint(args.files, repo_root)

    if args.json:
        print(
            json.dumps(
                {
                    "repo_root": str(repo_root),
                    "any_missing": any_missing,
                    "results": results,
                },
                indent=2,
            )
        )
    else:
        _print_human(results)

    return 1 if any_missing else 0


# Script entry point: main()'s return value (0 = every cited path resolved,
# 1 = at least one MISSING) becomes the process exit status, so the linter can
# be used as a gate.
if __name__ == "__main__":
    sys.exit(main())
