Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 70 additions & 8 deletions agent_eval/build_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,12 @@ def load_gold(manifest_path: Path) -> dict[str, Gold]:
# Separators on which a shell command line is split into segments:
# `;`, `&&`, `||`, `|`, newline, `$(` and a backtick.
_PIPELINE_SEP = re.compile(r"(?:;|&&|\|\||\||\n|\$\(|`)")
# Matches a token that begins with a shell-style `NAME=` assignment.
_ENV_ASSIGN = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*=")
# Command-prefix words that are skipped when looking for a segment's head token.
_WRAPPERS = {"sudo", "time", "nice", "nohup", "exec", "env", "command", "builtin"}
# Leading shell keywords that precede a command inside control flow, e.g.
# `if [ -n "$RETRIEVER_VENV" ]; then "$RETRIEVER_VENV/bin/retriever" query ...`
# or `... fi "$RETRIEVER_BIN" query ...`. After splitting, the retriever segment may
# start with `then`/`else`/`fi`/`done` etc., which must be stripped or the head token
# reads as the keyword, not the command.
_SHELL_KW = {"then", "do", "else", "elif", "fi", "done", "{"}
# NOTE(review): presumably the `timeout` options that consume a following value
# token — confirm against the wrapper-stripping code that reads this set.
_TIMEOUT_VAL_FLAGS = {"-k", "--kill-after", "-s", "--signal"}
# Case-insensitive pattern over several error-text fragments.
# NOTE(review): its consumer is not visible here — presumably used to spot
# output-parsing failures; confirm at the call site.
_PARSE_ERR = re.compile(r"pdf_basename|JSONDecodeError|Extra data|_default_decoder|KeyError", re.I)
# The baseline profile installs a PATH shim that prints this and exits 127. A
Expand All @@ -159,7 +165,7 @@ def _strip_wrappers(seg: str) -> list[str]:
if i < len(toks): # the DURATION token
i += 1
continue
if t in _WRAPPERS:
if t in _WRAPPERS or t in _SHELL_KW:
i += 1
continue
break
Expand All @@ -170,7 +176,10 @@ def _seg_is_retriever(seg: str) -> bool:
toks = _strip_wrappers(seg.strip())
if not toks:
return False
h = toks[0]
# Strip surrounding quotes so a guarded/quoted binary path like
# `"$RETRIEVER_VENV/bin/retriever"` is recognized (the var stays unexpanded,
# but the literal still ends in `/retriever`).
h = toks[0].strip("'\"")
if h == "retriever" or h.endswith("/retriever"):
return True
if len(toks) >= 3 and toks[0] == "uv" and toks[1] == "run" and toks[2] == "retriever":
Expand All @@ -180,8 +189,33 @@ def _seg_is_retriever(seg: str) -> bool:
return False


# Var names assigned from a resolved retriever binary, e.g.
# `RETRIEVER_BIN="$(command -v retriever)"`, `RETRIEVER_BIN="$RETRIEVER_VENV/bin/retriever"`,
# or `RETRIEVER=retriever`. The agent builds these to harden against an unset
# RETRIEVER_VENV, then invokes `"$RETRIEVER_BIN" query ...` — whose head is a $var,
# not a `retriever` literal, so it slips past _seg_is_retriever.
# Group 1 is the assigned name; the value may contain anything except a newline
# or `;` before the word `retriever`.
_RETR_VAR_ASSIGN = re.compile(r"\b([A-Za-z_]\w*)=[^\n;]*\bretriever\b")
# Matches a token that is exactly one variable reference, `$NAME` or `${NAME}`
# (group 1 is NAME). Both braces are optional independently, so the pattern does
# not require them to balance.
_VAR_REF = re.compile(r"^\$\{?([A-Za-z_]\w*)\}?$")


def _retriever_bin_vars(cmd: str) -> set[str]:
    """Collect the names of variables that *cmd* assigns from a retriever-like value.

    Every match of ``_RETR_VAR_ASSIGN`` contributes its captured variable name.
    A falsy *cmd* (``None`` or ``""``) is scanned as the empty string and so
    yields an empty set.
    """
    text = cmd or ""
    return {found.group(1) for found in _RETR_VAR_ASSIGN.finditer(text)}


def cmd_uses_retriever(cmd: str) -> bool:
    """Return True when some segment of *cmd* invokes the retriever binary.

    The command is split on ``_PIPELINE_SEP`` and each segment is checked in
    two passes:

    1. Direct invocation: any segment accepted by ``_seg_is_retriever``.
    2. Variable indirection: *cmd* assigns a variable from a value containing
       ``retriever`` (see ``_retriever_bin_vars``), and some segment's head
       token is a reference to that variable followed by at least one more
       token, e.g. ``"$RETRIEVER_BIN" query ...``.

    A falsy *cmd* is treated as the empty string and yields False.

    The previous body returned the result of pass 1 unconditionally, which
    left pass 2 unreachable; the early return now fires only on a positive
    direct match.
    """
    segs = _PIPELINE_SEP.split(cmd or "")
    if any(_seg_is_retriever(s) for s in segs):
        return True

    # Variable indirection: VAR=<...retriever...> earlier, then `"$VAR" <subcommand>`.
    rvars = _retriever_bin_vars(cmd)
    if not rvars:
        return False

    for s in segs:
        toks = _strip_wrappers(s.strip())
        if len(toks) < 2:
            # Need the binary reference plus at least a subcommand.
            continue
        # Drop surrounding quotes so `"$VAR"` and `'$VAR'` read as `$VAR`.
        m = _VAR_REF.match(toks[0].strip("'\""))
        if m and m.group(1) in rvars:
            return True
    return False


def _retriever_piped_to_parser(cmd: str) -> bool:
Expand All @@ -191,6 +225,10 @@ def _retriever_piped_to_parser(cmd: str) -> bool:

# Captures the numeric exit status from output text of the form
# `exited with code <n>`; callers compare the captured group against "0".
_CODEX_EXIT_RE = re.compile(r"exited with code (\d+)")
# Literal `"page_number"` JSON key; combined with a `"source"`/`"text"` check to
# recognise hit-shaped JSON in a call's output.
_HITS_JSON_RE = re.compile(r'"page_number"')
# Codex backgrounds a slow command (~1s yield) → output says "Process running with
# session ID <n>"; the agent then polls it via a function_call whose arguments carry
# {"session_id": <n>}. The clean exit lands on that poll, not the original query call.
_BG_SESSION_RE = re.compile(r"running with session(?:\s+ID)?\s+(\d+)", re.I)


def detect_retriever_usage_codex(agent_log: Path) -> dict[str, bool]:
Expand All @@ -207,6 +245,7 @@ def detect_retriever_usage_codex(agent_log: Path) -> dict[str, bool]:
return {"attempted": False, "clean": False, "engine": False}
calls: dict[str, str] = {}
outs: dict[str, str] = {}
polls: dict[str, str] = {} # call_id -> polled session_id (background-continue calls)
hits_seen = False
for line in agent_log.read_text().splitlines():
if not line.strip():
Expand All @@ -227,6 +266,9 @@ def detect_retriever_usage_codex(agent_log: Path) -> dict[str, bool]:
if isinstance(cmd, list):
cmd = " ".join(str(x) for x in cmd)
calls[p.get("call_id")] = str(cmd)
sid = a.get("session_id")
if sid is not None:
polls[p.get("call_id")] = str(sid)
elif p.get("type") == "function_call_output":
o = p.get("output")
if isinstance(o, dict):
Expand All @@ -236,6 +278,7 @@ def detect_retriever_usage_codex(agent_log: Path) -> dict[str, bool]:
if _HITS_JSON_RE.search(o) and ('"source"' in o or '"text"' in o):
hits_seen = True
attempted = clean = engine = False
retr_sessions: set[str] = set() # session IDs opened by a backgrounded retriever query
for cid, cmd in calls.items():
if not cmd_uses_retriever(cmd):
continue
Expand All @@ -247,11 +290,30 @@ def detect_retriever_usage_codex(agent_log: Path) -> dict[str, bool]:
if m and m.group(1) == "0":
clean = True
engine = True
# Codex backgrounds `retriever query` (1s yield), so its hits often arrive in a
# later polled output rather than a clean exit. Credit that to engine — but ONLY
# when a real retriever-query command was attempted, so direct LanceDB pandas
# reads (which also emit page_number/source/text) aren't miscounted. Guarantees
# engine ⊆ attempted.
bg = _BG_SESSION_RE.search(out) # query backgrounded → remember its session
if bg:
retr_sessions.add(bg.group(1))
# A backgrounded `retriever query` finishes in a later session-poll, not the original
# call — credit that poll's clean exit to the query. Follow re-yields (a poll may
# background again under a new session id) to a fixpoint, bounded by #polls.
if retr_sessions and not clean:
changed = True
while changed and not clean:
changed = False
for cid, sid in polls.items():
if sid not in retr_sessions:
continue
out = outs.get(cid, "")
m = _CODEX_EXIT_RE.search(out)
if m and m.group(1) == "0":
clean = engine = True
break
nb = _BG_SESSION_RE.search(out)
if nb and nb.group(1) not in retr_sessions:
retr_sessions.add(nb.group(1))
changed = True
# Hits-JSON fallback (query returned results even if no clean exit was captured),
# gated on a real retriever attempt so direct LanceDB reads aren't miscounted.
if attempted and hits_seen:
engine = True
return {"attempted": attempted, "clean": clean, "engine": engine}
Expand Down
5 changes: 5 additions & 0 deletions agent_eval/profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,13 @@ def build_query_workdir(*, base_dir: Path, query_dir: Path, profile: str, agent:
"""Create a per-query workdir that symlinks the shared base contents."""
wd = query_dir / "workdir"
wd.mkdir(parents=True, exist_ok=True)
# AGENT_EVAL_NO_PDFS=1 withholds the raw ./pdfs from the agent so it cannot
# fall back to reading source files (pdftotext/pypdf) — forces retriever use.
no_pdfs = os.environ.get("AGENT_EVAL_NO_PDFS") == "1"
# Shared, read-only-ish artifacts: symlink to the base.
for name in ("pdfs", "lancedb"):
if name == "pdfs" and no_pdfs:
continue
src = base_dir / name
dst = wd / name
if src.exists() and not dst.exists():
Expand Down
105 changes: 105 additions & 0 deletions nemo_retriever/src/nemo_retriever/cli/evidence.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-26, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Answer-ready ``{evidence, coverage}`` shaping for ``retriever query --format evidence``.

The skill reasons over this shape: each evidence item is fidelity-tagged and
citation-ready, and ``coverage`` summarizes what was searched and flags thin spots.
``--format evidence`` is opt-in; ``query``'s default output stays the flat hit list.
"""

from __future__ import annotations

import os
from typing import Any

from nemo_retriever.common.vdb.records import _derive_fidelity

_KNOWN_MODALITIES = {"text", "table", "chart", "image", "audio", "video_frame"}


def _normalize_modality(value: Any) -> str:
    """Map a raw content-type label onto one of the known modality names.

    The value is stringified and lower-cased (falsy values count as ``"text"``).
    A label already in ``_KNOWN_MODALITIES`` is returned unchanged; otherwise
    it is matched by prefix, and anything unrecognised falls back to ``"text"``.
    """
    label = str(value or "text").lower()
    if label in _KNOWN_MODALITIES:
        return label

    # (accepted prefixes, canonical modality); the prefixes are mutually
    # exclusive, so at most one row can match.
    prefix_table = (
        (("table",), "table"),
        (("chart",), "chart"),
        (("image", "infographic"), "image"),
        (("video",), "video_frame"),
        (("audio",), "audio"),
    )
    for prefixes, canonical in prefix_table:
        if label.startswith(prefixes):
            return canonical
    return "text"


def _evidence_item(hit: dict[str, Any]) -> dict[str, Any]:
    """Shape one raw hit into a fidelity-tagged, citation-ready evidence item.

    Args:
        hit: A raw hit mapping. Keys read here: ``metadata``, ``pdf_basename``,
            ``source``, ``content_type``, ``page_number``, ``text``, ``_score``
            and ``_distance``. A non-dict ``metadata`` is treated as empty.

    Returns:
        A dict with the keys ``text``, ``source``, ``locator``, ``modality``,
        ``fidelity``, ``score`` and ``citation``.

        * ``source`` is the basename of ``pdf_basename`` (or ``source``) with a
          trailing ``.pdf`` removed, case-insensitively.
        * ``locator`` takes the first available of: page number, segment start
          seconds, frame timestamp seconds, normalised bbox; otherwise a page
          locator whose value is ``None``.
        * ``score`` is ``_score`` when present and not ``None``, else
          ``_distance`` passed through unchanged, else ``0.0``.
        * ``text`` is always a non-``None`` value: a missing key *or* an
          explicit ``None`` both become ``""``, matching the None-safe handling
          of ``source`` and modality.
    """
    raw_meta = hit.get("metadata")
    meta = raw_meta if isinstance(raw_meta, dict) else {}

    src_raw = hit.get("pdf_basename") or hit.get("source") or ""
    source = os.path.basename(str(src_raw))
    if source.lower().endswith(".pdf"):
        source = source[:-4]

    raw_modality = hit.get("content_type") or meta.get("type") or "text"
    modality = _normalize_modality(raw_modality)

    page = hit.get("page_number")
    if page is not None:
        locator = {"kind": "page", "value": page}
        citation = f"{source} p.{page}"
    elif meta.get("segment_start_seconds") is not None:
        locator = {"kind": "segment", "value": meta["segment_start_seconds"]}
        citation = f"{source} @{meta['segment_start_seconds']}"
    elif meta.get("frame_timestamp_seconds") is not None:
        locator = {"kind": "timestamp", "value": meta["frame_timestamp_seconds"]}
        citation = f"{source} @{meta['frame_timestamp_seconds']}"
    elif meta.get("bbox_xyxy_norm") is not None:
        locator = {"kind": "bbox", "value": meta["bbox_xyxy_norm"]}
        citation = source
    else:
        # No positional information at all: keep the locator shape stable.
        locator = {"kind": "page", "value": None}
        citation = source

    # An explicit fidelity tag wins; otherwise derive one, defaulting to "verbatim".
    # NOTE(review): `meta` is passed for both trailing arguments of
    # _derive_fidelity — confirm against its signature that this is intended.
    fidelity = meta.get("fidelity") or _derive_fidelity(raw_modality, meta, meta) or "verbatim"

    if "_score" in hit and hit["_score"] is not None:
        score: float = hit["_score"]
    elif "_distance" in hit and hit["_distance"] is not None:
        score = hit["_distance"]
    else:
        score = 0.0

    # `hit.get("text", "")` would pass an explicit None straight through; normalise it.
    text = hit.get("text")
    if text is None:
        text = ""

    return {
        "text": text,
        "source": source,
        "locator": locator,
        "modality": modality,
        "fidelity": fidelity,
        "score": score,
        "citation": citation,
    }


def build_evidence_result(hits: list, strategies_used: list[str]) -> dict[str, Any]:
    """Build the ``{evidence, coverage}`` result from raw hits.

    Each hit is converted with ``_evidence_item``; a falsy *hits* is treated as
    an empty list. ``coverage`` carries *strategies_used* as given, the number
    of distinct non-empty sources (``n_docs_seen``) and a ``thin_spots`` list:

    * no evidence at all → a single "no matches" note;
    * exactly one distinct source → "single source";
    * every item tagged ``vlm_caption`` → a low-fidelity note.
    """
    evidence = [_evidence_item(raw) for raw in hits or []]
    distinct_sources = {item["source"] for item in evidence if item.get("source")}

    if not evidence:
        thin_spots = ["no matches — likely out of corpus"]
    else:
        thin_spots = []
        if len(distinct_sources) == 1:
            thin_spots.append("single source")
        if all(item["fidelity"] == "vlm_caption" for item in evidence):
            thin_spots.append("only low-fidelity (chart/image) evidence")

    coverage = {
        "strategies_used": strategies_used,
        "n_docs_seen": len(distinct_sources),
        "thin_spots": thin_spots,
    }
    return {"evidence": evidence, "coverage": coverage}
Loading
Loading