NVIDIA · edknv · Jun 7, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 15, 2026
@@ -136,6 +136,12 @@ def load_gold(manifest_path: Path) -> dict[str, Gold]:
+# Separators at which a shell command line is split into segments: `;`, `&&`,
+# `||`, `|`, a newline, the start of a `$(` substitution, and a backtick.
 _PIPELINE_SEP = re.compile(r"(?:;|&&|\|\||\||\n|\$\(|`)")
+# Matches a token that begins with a `NAME=` assignment (shell identifier, then `=`).
 _ENV_ASSIGN = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*=")
+# Command prefixes that _strip_wrappers skips over when locating the head token.
 _WRAPPERS = {"sudo", "time", "nice", "nohup", "exec", "env", "command", "builtin"}
+# Leading shell keywords that precede a command inside control flow, e.g.
+# `if [ -n "$RETRIEVER_VENV" ]; then "$RETRIEVER_VENV/bin/retriever" query ...`
+# or `... fi "$RETRIEVER_BIN" query ...`. After splitting, the retriever segment may
+# start with `then`/`else`/`fi`/`done` etc., which must be stripped or the head token
+# reads as the keyword, not the command.
+_SHELL_KW = {"then", "do", "else", "elif", "fi", "done", "{"}
+# NOTE(review): presumably the `timeout` options that consume a following value;
+# the code that reads this set is not visible in this hunk — confirm.
 _TIMEOUT_VAL_FLAGS = {"-k", "--kill-after", "-s", "--signal"}
+# Case-insensitive match for any of several parse-failure markers
+# (e.g. `JSONDecodeError`, `Extra data`, `KeyError`).
 _PARSE_ERR = re.compile(r"pdf_basename|JSONDecodeError|Extra data|_default_decoder|KeyError", re.I)
 # The baseline profile installs a PATH shim that prints this and exits 127. A
@@ -159,7 +165,7 @@ def _strip_wrappers(seg: str) -> list[str]:
             if i < len(toks):  # the DURATION token
                 i += 1
             continue
-        if t in _WRAPPERS:
+        if t in _WRAPPERS or t in _SHELL_KW:
             i += 1
             continue
         break
@@ -170,7 +176,10 @@ def _seg_is_retriever(seg: str) -> bool:
     toks = _strip_wrappers(seg.strip())
     if not toks:
         return False
-    h = toks[0]
+    # Strip surrounding quotes so a guarded/quoted binary path like
+    # `"$RETRIEVER_VENV/bin/retriever"` is recognized (the var stays unexpanded,
+    # but the literal still ends in `/retriever`).
+    h = toks[0].strip("'\"")
     if h == "retriever" or h.endswith("/retriever"):
         return True
     if len(toks) >= 3 and toks[0] == "uv" and toks[1] == "run" and toks[2] == "retriever":
@@ -180,8 +189,33 @@ def _seg_is_retriever(seg: str) -> bool:
     return False
 
 
+# Var names assigned from a resolved retriever binary, e.g.
+# `RETRIEVER_BIN="$(command -v retriever)"`, `RETRIEVER_BIN="$RETRIEVER_VENV/bin/retriever"`,
+# or `RETRIEVER=retriever`. The agent builds these to harden against an unset
+# RETRIEVER_VENV, then invokes `"$RETRIEVER_BIN" query ...` — whose head is a $var,
+# not a `retriever` literal, so it slips past _seg_is_retriever.
+# The pattern captures the variable name; the value may not span a newline or `;`
+# and must contain `retriever` as a whole word.
+_RETR_VAR_ASSIGN = re.compile(r"\b([A-Za-z_]\w*)=[^\n;]*\bretriever\b")
+# Matches a token that is exactly one variable reference, `$NAME` or `${NAME}`,
+# capturing NAME. Callers strip surrounding quotes before matching.
+_VAR_REF = re.compile(r"^\$\{?([A-Za-z_]\w*)\}?$")
+
+
+def _retriever_bin_vars(cmd: str) -> set[str]:
+    """Collect the names of variables that ``cmd`` assigns from a retriever binary.
+
+    A name is captured wherever ``_RETR_VAR_ASSIGN`` matches, i.e. for each
+    ``NAME=...retriever...`` assignment. A ``None`` or empty command yields an
+    empty set.
+    """
+    text = cmd or ""
+    return {found.group(1) for found in _RETR_VAR_ASSIGN.finditer(text)}
+
+
 def cmd_uses_retriever(cmd: str) -> bool:
-    return any(_seg_is_retriever(s) for s in _PIPELINE_SEP.split(cmd or ""))
+    """Report whether any segment of the shell command ``cmd`` invokes the retriever.
+
+    A segment counts when ``_seg_is_retriever`` accepts it directly, or when its
+    head token (after wrappers and quotes are stripped) is a ``$VAR``/``${VAR}``
+    reference to a variable that ``cmd`` assigns from a retriever binary and at
+    least one more token (the subcommand) follows.
+    """
+    segments = _PIPELINE_SEP.split(cmd or "")
+    if any(_seg_is_retriever(seg) for seg in segments):
+        return True
+    # Variable indirection: VAR=<...retriever...> somewhere in the command, then
+    # `"$VAR" <subcommand>` as the head of a segment.
+    bin_vars = _retriever_bin_vars(cmd)
+    if not bin_vars:
+        return False
+    for seg in segments:
+        tokens = _strip_wrappers(seg.strip())
+        if len(tokens) < 2:  # need the binary plus at least a subcommand
+            continue
+        ref = _VAR_REF.match(tokens[0].strip("'\""))
+        if ref is not None and ref.group(1) in bin_vars:
+            return True
+    return False
 
 
 def _retriever_piped_to_parser(cmd: str) -> bool:
@@ -191,6 +225,10 @@ def _retriever_piped_to_parser(cmd: str) -> bool:
 
+# Captures the numeric exit status from an "exited with code <n>" line in a
+# command's output.
 _CODEX_EXIT_RE = re.compile(r"exited with code (\d+)")
+# Literal `"page_number"` key; together with a `"source"`/`"text"` check it marks
+# an output as containing retriever hit JSON.
 _HITS_JSON_RE = re.compile(r'"page_number"')
+# Codex backgrounds a slow command (~1s yield) → output says "Process running with
+# session ID <n>"; the agent then polls it via a function_call whose arguments carry
+# {"session_id": <n>}. The clean exit lands on that poll, not the original query call.
+_BG_SESSION_RE = re.compile(r"running with session(?:\s+ID)?\s+(\d+)", re.I)
 
 
 def detect_retriever_usage_codex(agent_log: Path) -> dict[str, bool]:
@@ -207,6 +245,7 @@ def detect_retriever_usage_codex(agent_log: Path) -> dict[str, bool]:
         return {"attempted": False, "clean": False, "engine": False}
     calls: dict[str, str] = {}
     outs: dict[str, str] = {}
+    polls: dict[str, str] = {}  # call_id -> polled session_id (background-continue calls)
     hits_seen = False
     for line in agent_log.read_text().splitlines():
         if not line.strip():
@@ -227,6 +266,9 @@ def detect_retriever_usage_codex(agent_log: Path) -> dict[str, bool]:
             if isinstance(cmd, list):
                 cmd = " ".join(str(x) for x in cmd)
             calls[p.get("call_id")] = str(cmd)
+            sid = a.get("session_id")
+            if sid is not None:
+                polls[p.get("call_id")] = str(sid)
         elif p.get("type") == "function_call_output":
             o = p.get("output")
             if isinstance(o, dict):
@@ -236,6 +278,7 @@ def detect_retriever_usage_codex(agent_log: Path) -> dict[str, bool]:
             if _HITS_JSON_RE.search(o) and ('"source"' in o or '"text"' in o):
                 hits_seen = True
     attempted = clean = engine = False
+    retr_sessions: set[str] = set()  # session IDs opened by a backgrounded retriever query
     for cid, cmd in calls.items():
         if not cmd_uses_retriever(cmd):
             continue
@@ -247,11 +290,30 @@ def detect_retriever_usage_codex(agent_log: Path) -> dict[str, bool]:
         if m and m.group(1) == "0":
             clean = True
             engine = True
-    # Codex backgrounds `retriever query` (1s yield), so its hits often arrive in a
-    # later polled output rather than a clean exit. Credit that to engine — but ONLY
-    # when a real retriever-query command was attempted, so direct LanceDB pandas
-    # reads (which also emit page_number/source/text) aren't miscounted. Guarantees
-    # engine ⊆ attempted.
+        bg = _BG_SESSION_RE.search(out)  # query backgrounded → remember its session
+        if bg:
+            retr_sessions.add(bg.group(1))
+    # A backgrounded `retriever query` finishes in a later session-poll, not the original
+    # call — credit that poll's clean exit to the query. Follow re-yields (a poll may
+    # background again under a new session id) to a fixpoint, bounded by #polls.
+    if retr_sessions and not clean:
+        changed = True
+        while changed and not clean:
+            changed = False
+            for cid, sid in polls.items():
+                if sid not in retr_sessions:
+                    continue
+                out = outs.get(cid, "")
+                m = _CODEX_EXIT_RE.search(out)
+                if m and m.group(1) == "0":
+                    clean = engine = True
+                    break
+                nb = _BG_SESSION_RE.search(out)
+                if nb and nb.group(1) not in retr_sessions:
+                    retr_sessions.add(nb.group(1))
+                    changed = True
+    # Hits-JSON fallback (query returned results even if no clean exit was captured),
+    # gated on a real retriever attempt so direct LanceDB reads aren't miscounted.
     if attempted and hits_seen:
         engine = True
     return {"attempted": attempted, "clean": clean, "engine": engine}

@@ -186,8 +186,13 @@ def build_query_workdir(*, base_dir: Path, query_dir: Path, profile: str, agent:
     """Create a per-query workdir that symlinks the shared base contents."""
     wd = query_dir / "workdir"
     wd.mkdir(parents=True, exist_ok=True)
+    # AGENT_EVAL_NO_PDFS=1 withholds the raw ./pdfs from the agent so it cannot
+    # fall back to reading source files (pdftotext/pypdf) — forces retriever use.
+    no_pdfs = os.environ.get("AGENT_EVAL_NO_PDFS") == "1"
     # Shared, read-only-ish artifacts: symlink to the base.
     for name in ("pdfs", "lancedb"):
+        if name == "pdfs" and no_pdfs:
+            continue
         src = base_dir / name
         dst = wd / name
         if src.exists() and not dst.exists():

@@ -0,0 +1,105 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-26, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Answer-ready ``{evidence, coverage}`` shaping for ``retriever query --format evidence``.
+
+The skill reasons over this shape: each evidence item is fidelity-tagged and
+citation-ready, and ``coverage`` summarizes what was searched and flags thin spots.
+``--format evidence`` is opt-in; ``query``'s default output stays the flat hit list.
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any
+
+from nemo_retriever.common.vdb.records import _derive_fidelity
+
+# Canonical modality labels; _normalize_modality returns a lowercased value
+# unchanged when it is already one of these.
+_KNOWN_MODALITIES = {"text", "table", "chart", "image", "audio", "video_frame"}
+
+
+def _normalize_modality(value: Any) -> str:
+    """Map a raw content-type label onto one of the known modality names.
+
+    The value is stringified and lowercased (a falsy value is treated as
+    ``"text"``). Known modalities pass through unchanged; otherwise the label is
+    classified by prefix, and anything unrecognized falls back to ``"text"``.
+    """
+    label = str(value or "text").lower()
+    if label in _KNOWN_MODALITIES:
+        return label
+    # (prefix or tuple of prefixes, canonical modality)
+    prefix_table = (
+        ("table", "table"),
+        ("chart", "chart"),
+        (("image", "infographic"), "image"),
+        ("video", "video_frame"),
+        ("audio", "audio"),
+    )
+    for prefix, canonical in prefix_table:
+        if label.startswith(prefix):
+            return canonical
+    return "text"
+
+
+def _evidence_item(hit: dict[str, Any]) -> dict[str, Any]:
+    """Convert one raw retriever hit into a citation-ready evidence item.
+
+    The source is reduced to its basename with a trailing ``.pdf`` removed. The
+    locator is chosen in priority order: page number, segment start, frame
+    timestamp, bounding box; with none of these it is a page locator whose value
+    is ``None``. Fidelity comes from the metadata, else from ``_derive_fidelity``,
+    else ``"verbatim"``. The score is ``_score`` when set, otherwise
+    ``_distance``, otherwise ``0.0``.
+    """
+    metadata = hit.get("metadata")
+    meta: dict[str, Any] = metadata if isinstance(metadata, dict) else {}
+
+    source = os.path.basename(str(hit.get("pdf_basename") or hit.get("source") or ""))
+    if source.lower().endswith(".pdf"):
+        source = source[: -len(".pdf")]
+
+    raw_modality = hit.get("content_type") or meta.get("type") or "text"
+    modality = _normalize_modality(raw_modality)
+
+    page = hit.get("page_number")
+    segment_start = meta.get("segment_start_seconds")
+    frame_ts = meta.get("frame_timestamp_seconds")
+    bbox = meta.get("bbox_xyxy_norm")
+    citation = source  # used as-is when there is no page or time to cite
+    if page is not None:
+        locator = {"kind": "page", "value": page}
+        citation = f"{source} p.{page}"
+    elif segment_start is not None:
+        locator = {"kind": "segment", "value": segment_start}
+        citation = f"{source} @{segment_start}"
+    elif frame_ts is not None:
+        locator = {"kind": "timestamp", "value": frame_ts}
+        citation = f"{source} @{frame_ts}"
+    elif bbox is not None:
+        locator = {"kind": "bbox", "value": bbox}
+    else:
+        locator = {"kind": "page", "value": None}
+
+    fidelity = meta.get("fidelity")
+    if not fidelity:
+        fidelity = _derive_fidelity(raw_modality, meta, meta) or "verbatim"
+
+    # Prefer an explicit score, then a distance, then a neutral 0.0.
+    score = hit.get("_score")
+    if score is None:
+        score = hit.get("_distance")
+    if score is None:
+        score = 0.0
+
+    return {
+        "text": hit.get("text", ""),
+        "source": source,
+        "locator": locator,
+        "modality": modality,
+        "fidelity": fidelity,
+        "score": score,
+        "citation": citation,
+    }
+
+
+def build_evidence_result(hits: list, strategies_used: list[str]) -> dict[str, Any]:
+    """Shape raw hits into the answer-ready ``{evidence, coverage}`` result.
+
+    ``evidence`` holds one item per hit, built by ``_evidence_item``. ``coverage``
+    echoes ``strategies_used``, counts the distinct non-empty sources as
+    ``n_docs_seen``, and lists ``thin_spots``: no matches at all, a single
+    source, or evidence whose fidelity is ``vlm_caption`` throughout. This is the
+    shape emitted by ``retriever query --format evidence``.
+    """
+    items = list(map(_evidence_item, hits or []))
+    distinct_sources = {item["source"] for item in items if item["source"]}
+
+    if items:
+        thin_spots: list[str] = []
+        if len(distinct_sources) == 1:
+            thin_spots.append("single source")
+        if all(item["fidelity"] == "vlm_caption" for item in items):
+            thin_spots.append("only low-fidelity (chart/image) evidence")
+    else:
+        thin_spots = ["no matches — likely out of corpus"]
+
+    coverage = {
+        "strategies_used": strategies_used,
+        "n_docs_seen": len(distinct_sources),
+        "thin_spots": thin_spots,
+    }
+    return {"evidence": items, "coverage": coverage}