diff --git a/agent_eval/build_report.py b/agent_eval/build_report.py index 461614d813..83fc866c5f 100644 --- a/agent_eval/build_report.py +++ b/agent_eval/build_report.py @@ -10,7 +10,7 @@ when several runs are passed. Unlike the runner, this script MAY import ``nemo_retriever`` to reuse -``score.recall_at_k`` and ``llm.clients.judge.LLMJudge`` (it runs where the +``score.recall_at_k`` and ``models.llm.clients.judge.LLMJudge`` (it runs where the codebase exists). Both imports degrade gracefully if unavailable. Usage: @@ -136,6 +136,12 @@ def load_gold(manifest_path: Path) -> dict[str, Gold]: _PIPELINE_SEP = re.compile(r"(?:;|&&|\|\||\||\n|\$\(|`)") _ENV_ASSIGN = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*=") _WRAPPERS = {"sudo", "time", "nice", "nohup", "exec", "env", "command", "builtin"} +# Leading shell keywords that precede a command inside control flow, e.g. +# `if [ -n "$RETRIEVER_VENV" ]; then "$RETRIEVER_VENV/bin/retriever" query ...`, +# `if ! retriever query ...; then ...`, or `... fi "$RETRIEVER_BIN" query ...`. After splitting, +# the retriever segment may start with `if`/`!`/`then`/`else`/`fi`/`done` etc., which must be +# stripped or the head token reads as the keyword, not the command. +_SHELL_KW = {"if", "then", "elif", "else", "fi", "while", "until", "do", "done", "!", "{", "}"} _TIMEOUT_VAL_FLAGS = {"-k", "--kill-after", "-s", "--signal"} _PARSE_ERR = re.compile(r"pdf_basename|JSONDecodeError|Extra data|_default_decoder|KeyError", re.I) # The baseline profile installs a PATH shim that prints this and exits 127. 
A @@ -159,7 +165,7 @@ def _strip_wrappers(seg: str) -> list[str]: if i < len(toks): # the DURATION token i += 1 continue - if t in _WRAPPERS: + if t in _WRAPPERS or t in _SHELL_KW: i += 1 continue break @@ -170,7 +176,10 @@ def _seg_is_retriever(seg: str) -> bool: toks = _strip_wrappers(seg.strip()) if not toks: return False - h = toks[0] + # Strip surrounding quotes so a guarded/quoted binary path like + # `"$RETRIEVER_VENV/bin/retriever"` is recognized (the var stays unexpanded, + # but the literal still ends in `/retriever`). + h = toks[0].strip("'\"") if h == "retriever" or h.endswith("/retriever"): return True if len(toks) >= 3 and toks[0] == "uv" and toks[1] == "run" and toks[2] == "retriever": @@ -180,8 +189,33 @@ def _seg_is_retriever(seg: str) -> bool: return False +# Var names assigned from a resolved retriever binary, e.g. +# `RETRIEVER_BIN="$(command -v retriever)"`, `RETRIEVER_BIN="$RETRIEVER_VENV/bin/retriever"`, +# or `RETRIEVER=retriever`. The agent builds these to harden against an unset +# RETRIEVER_VENV, then invokes `"$RETRIEVER_BIN" query ...` — whose head is a $var, +# not a `retriever` literal, so it slips past _seg_is_retriever. +_RETR_VAR_ASSIGN = re.compile(r"\b([A-Za-z_]\w*)=[^\n;]*\bretriever\b") +_VAR_REF = re.compile(r"^\$\{?([A-Za-z_]\w*)\}?$") + + +def _retriever_bin_vars(cmd: str) -> set[str]: + return set(_RETR_VAR_ASSIGN.findall(cmd or "")) + + def cmd_uses_retriever(cmd: str) -> bool: - return any(_seg_is_retriever(s) for s in _PIPELINE_SEP.split(cmd or "")) + segs = _PIPELINE_SEP.split(cmd or "") + if any(_seg_is_retriever(s) for s in segs): + return True + # Variable indirection: VAR=<...retriever...> earlier, then `"$VAR" <subcommand>`. 
+ rvars = _retriever_bin_vars(cmd) + if rvars: + for s in segs: + toks = _strip_wrappers(s.strip()) + if len(toks) >= 2: # head is the binary, plus at least a subcommand + m = _VAR_REF.match(toks[0].strip("'\"")) + if m and m.group(1) in rvars: + return True + return False def _retriever_piped_to_parser(cmd: str) -> bool: @@ -191,6 +225,10 @@ def _retriever_piped_to_parser(cmd: str) -> bool: _CODEX_EXIT_RE = re.compile(r"exited with code (\d+)") _HITS_JSON_RE = re.compile(r'"page_number"') +# Codex backgrounds a slow command (~1s yield) → output says "Process running with +# session ID <N>"; the agent then polls it via a function_call whose arguments carry +# {"session_id": <N>}. The clean exit lands on that poll, not the original query call. +_BG_SESSION_RE = re.compile(r"running with session(?:\s+ID)?\s+(\d+)", re.I) def detect_retriever_usage_codex(agent_log: Path) -> dict[str, bool]: @@ -207,6 +245,7 @@ def detect_retriever_usage_codex(agent_log: Path) -> dict[str, bool]: return {"attempted": False, "clean": False, "engine": False} calls: dict[str, str] = {} outs: dict[str, str] = {} + polls: dict[str, str] = {} # call_id -> polled session_id (background-continue calls) hits_seen = False for line in agent_log.read_text().splitlines(): if not line.strip(): @@ -227,6 +266,9 @@ def detect_retriever_usage_codex(agent_log: Path) -> dict[str, bool]: if isinstance(cmd, list): cmd = " ".join(str(x) for x in cmd) calls[p.get("call_id")] = str(cmd) + sid = a.get("session_id") + if sid is not None: + polls[p.get("call_id")] = str(sid) elif p.get("type") == "function_call_output": o = p.get("output") if isinstance(o, dict): @@ -236,6 +278,7 @@ def detect_retriever_usage_codex(agent_log: Path) -> dict[str, bool]: if _HITS_JSON_RE.search(o) and ('"source"' in o or '"text"' in o): hits_seen = True attempted = clean = engine = False + retr_sessions: set[str] = set() # session IDs opened by a backgrounded retriever query for cid, cmd in calls.items(): if not cmd_uses_retriever(cmd): 
continue @@ -247,11 +290,30 @@ def detect_retriever_usage_codex(agent_log: Path) -> dict[str, bool]: if m and m.group(1) == "0": clean = True engine = True - # Codex backgrounds `retriever query` (1s yield), so its hits often arrive in a - # later polled output rather than a clean exit. Credit that to engine — but ONLY - # when a real retriever-query command was attempted, so direct LanceDB pandas - # reads (which also emit page_number/source/text) aren't miscounted. Guarantees - # engine ⊆ attempted. + bg = _BG_SESSION_RE.search(out) # query backgrounded → remember its session + if bg: + retr_sessions.add(bg.group(1)) + # A backgrounded `retriever query` finishes in a later session-poll, not the original + # call — credit that poll's clean exit to the query. Follow re-yields (a poll may + # background again under a new session id) to a fixpoint, bounded by #polls. + if retr_sessions and not clean: + changed = True + while changed and not clean: + changed = False + for cid, sid in polls.items(): + if sid not in retr_sessions: + continue + out = outs.get(cid, "") + m = _CODEX_EXIT_RE.search(out) + if m and m.group(1) == "0": + clean = engine = True + break + nb = _BG_SESSION_RE.search(out) + if nb and nb.group(1) not in retr_sessions: + retr_sessions.add(nb.group(1)) + changed = True + # Hits-JSON fallback (query returned results even if no clean exit was captured), + # gated on a real retriever attempt so direct LanceDB reads aren't miscounted. 
if attempted and hits_seen: engine = True return {"attempted": attempted, "clean": clean, "engine": engine} @@ -398,7 +460,7 @@ def build_judge(model: str, api_base: str | None, api_key_env: str): print(f" judge disabled: ${api_key_env} not set", file=sys.stderr) return None try: - from nemo_retriever.llm.clients.judge import LLMJudge # type: ignore + from nemo_retriever.models.llm.clients.judge import LLMJudge # type: ignore except Exception as exc: # noqa: BLE001 print(f" judge disabled: cannot import LLMJudge ({exc})", file=sys.stderr) return None @@ -415,7 +477,13 @@ def _load_judge_cache(run_dir: Path) -> dict[str, tuple]: r = json.loads(rp.read_text()) except Exception: return {} - return {q["query_id"]: (q.get("judge_score"), q.get("judge_error", "")) for q in r.get("per_query", [])} + # Only reuse SUCCESSFUL judgements; a prior run that scored None (e.g. judge + # import broken / disabled) must not poison re-judging into skipping forever. + return { + q["query_id"]: (q.get("judge_score"), q.get("judge_error", "")) + for q in r.get("per_query", []) + if q.get("judge_score") is not None + } def apply_judge( diff --git a/agent_eval/profiles.py b/agent_eval/profiles.py index e68d52d52c..311796b84c 100644 --- a/agent_eval/profiles.py +++ b/agent_eval/profiles.py @@ -186,8 +186,13 @@ def build_query_workdir(*, base_dir: Path, query_dir: Path, profile: str, agent: """Create a per-query workdir that symlinks the shared base contents.""" wd = query_dir / "workdir" wd.mkdir(parents=True, exist_ok=True) + # AGENT_EVAL_NO_PDFS=1 withholds the raw ./pdfs from the agent so it cannot + # fall back to reading source files (pdftotext/pypdf) — forces retriever use. + no_pdfs = os.environ.get("AGENT_EVAL_NO_PDFS") == "1" # Shared, read-only-ish artifacts: symlink to the base. 
for name in ("pdfs", "lancedb"): + if name == "pdfs" and no_pdfs: + continue src = base_dir / name dst = wd / name if src.exists() and not dst.exists(): diff --git a/nemo_retriever/src/nemo_retriever/cli/evidence.py b/nemo_retriever/src/nemo_retriever/cli/evidence.py new file mode 100644 index 0000000000..505827becc --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/cli/evidence.py @@ -0,0 +1,105 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-26, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Answer-ready ``{evidence, coverage}`` shaping for ``retriever query --format evidence``. + +The skill reasons over this shape: each evidence item is fidelity-tagged and +citation-ready, and ``coverage`` summarizes what was searched and flags thin spots. +``--format evidence`` is opt-in; ``query``'s default output stays the flat hit list. +""" + +from __future__ import annotations + +import os +from typing import Any + +from nemo_retriever.common.vdb.records import _derive_fidelity + +_KNOWN_MODALITIES = {"text", "table", "chart", "image", "audio", "video_frame"} + + +def _normalize_modality(value: Any) -> str: + m = str(value or "text").lower() + if m in _KNOWN_MODALITIES: + return m + if m.startswith("table"): + return "table" + if m.startswith("chart"): + return "chart" + if m.startswith(("image", "infographic")): + return "image" + if m.startswith("video"): + return "video_frame" + if m.startswith("audio"): + return "audio" + return "text" + + +def _evidence_item(hit: dict[str, Any]) -> dict[str, Any]: + meta = hit.get("metadata") if isinstance(hit.get("metadata"), dict) else {} + src_raw = hit.get("pdf_basename") or hit.get("source") or "" + source = os.path.basename(str(src_raw)) + if source.lower().endswith(".pdf"): + source = source[:-4] + raw_modality = hit.get("content_type") or meta.get("type") or "text" + modality = _normalize_modality(raw_modality) + + page = hit.get("page_number") + if page is not None: + 
locator = {"kind": "page", "value": page} + citation = f"{source} p.{page}" + elif meta.get("segment_start_seconds") is not None: + locator = {"kind": "segment", "value": meta["segment_start_seconds"]} + citation = f"{source} @{meta['segment_start_seconds']}" + elif meta.get("frame_timestamp_seconds") is not None: + locator = {"kind": "timestamp", "value": meta["frame_timestamp_seconds"]} + citation = f"{source} @{meta['frame_timestamp_seconds']}" + elif meta.get("bbox_xyxy_norm") is not None: + locator = {"kind": "bbox", "value": meta["bbox_xyxy_norm"]} + citation = source + else: + locator = {"kind": "page", "value": None} + citation = source + + fidelity = meta.get("fidelity") or _derive_fidelity(raw_modality, meta, meta) or "verbatim" + + if "_score" in hit and hit["_score"] is not None: + score: float = hit["_score"] + elif "_distance" in hit and hit["_distance"] is not None: + score = hit["_distance"] + else: + score = 0.0 + + return { + "text": hit.get("text", ""), + "source": source, + "locator": locator, + "modality": modality, + "fidelity": fidelity, + "score": score, + "citation": citation, + } + + +def build_evidence_result(hits: list, strategies_used: list[str]) -> dict[str, Any]: + """Assemble the answer-ready ``{evidence, coverage}`` contract shape from raw hits. + + ``evidence`` items are fidelity-tagged and citation-ready; ``coverage`` summarizes + what was searched (``strategies_used``, ``n_docs_seen``) and flags thin spots + (single source, low-fidelity-only, out-of-corpus). This is the shape the skill + reasons over — emitted by ``retriever query --format evidence``. 
+ """ + evidence = [_evidence_item(h) for h in (hits or [])] + sources = {e["source"] for e in evidence if e.get("source")} + thin: list[str] = [] + if not evidence: + thin.append("no matches — likely out of corpus") + else: + if len(sources) == 1: + thin.append("single source") + if all(e["fidelity"] == "vlm_caption" for e in evidence): + thin.append("only low-fidelity (chart/image) evidence") + return { + "evidence": evidence, + "coverage": {"strategies_used": strategies_used, "n_docs_seen": len(sources), "thin_spots": thin}, + } diff --git a/nemo_retriever/src/nemo_retriever/cli/main.py b/nemo_retriever/src/nemo_retriever/cli/main.py index 1a9db0ed7c..860bfee61f 100644 --- a/nemo_retriever/src/nemo_retriever/cli/main.py +++ b/nemo_retriever/src/nemo_retriever/cli/main.py @@ -42,6 +42,7 @@ from nemo_retriever.cli.ingest_workflow import ( run_ingest_workflow, ) +from nemo_retriever.cli.evidence import build_evidence_result from nemo_retriever.cli.query_workflow import query_documents from nemo_retriever.query.options import ( QueryEmbedOptions, @@ -92,11 +93,27 @@ _ROOT_CLI_ERRORS = (OSError, RuntimeError, ValueError, ValidationError) -def _query_cli_hit(hit: RetrievalHit) -> dict[str, object]: +def _query_cli_hit(hit: RetrievalHit, max_text_chars: int | None = None) -> dict[str, object]: + metadata = hit.get("metadata") or {} + modality = hit.get("content_type") or metadata.get("type") or "text" + # Relevance the engine ranked by: hybrid/rerank score if present, else the + # vector distance, else null. Hit ORDER is authoritative; score is informational. + if "_score" in hit and hit["_score"] is not None: + score: object = hit["_score"] + elif "_distance" in hit and hit["_distance"] is not None: + score = hit["_distance"] + else: + score = None + text = hit.get("text", "") + # Compact output: truncate to max_text_chars (0 = metadata-only). None = full text. 
+ if max_text_chars is not None and max_text_chars >= 0 and len(text) > max_text_chars: + text = text[:max_text_chars] + ("…" if max_text_chars > 0 else "") return { "source": hit.get("source", ""), "page_number": hit.get("page_number"), - "text": hit.get("text", ""), + "text": text, + "modality": modality, + "score": score, } @@ -356,6 +373,14 @@ def ingest_command( "table without duplicate checks; rerunning the same inputs in append mode creates duplicates." ), ), + hybrid: bool = typer.Option( + False, + "--hybrid/--no-hybrid", + help=( + "Also build a full-text (BM25) index over the ingested text, so `query --hybrid` can " + "fuse lexical + vector retrieval. Opt-in (default off) — vector-only otherwise." + ), + ), ray_address: str | None = typer.Option(None, "--ray-address", help="Ray address for batch run mode."), ray_log_to_driver: bool | None = typer.Option( None, @@ -696,6 +721,7 @@ def ingest_command( lancedb_uri=lancedb_uri, table_name=table_name, overwrite=overwrite, + hybrid=hybrid, ), ) ) @@ -780,45 +806,92 @@ def query_command( "any of --reranker-invoke-url / --reranker-model-name / --reranker-backend is set." ), ), + hybrid: bool = typer.Option( + False, + "--hybrid/--no-hybrid", + help=( + "Fused vector + full-text (BM25) retrieval; falls back to vector-only if the table " + "has no FTS index. Opt-in (default off) — preserves the legacy vector-only default." + ), + ), + output_format: str = typer.Option( + "hits", + "--format", + help=( + "'hits' (default): raw ranked hit list (source/page/text/modality/score) — the legacy " + "output. 'evidence': answer-ready, fidelity-tagged, cited evidence + coverage (opt-in)." + ), + ), + max_text_chars: int | None = typer.Option( + None, + "--max-text-chars", + help="('hits' format only) Truncate each hit's text to N chars (0 = metadata-only). 
Default: full text.", + ), ) -> None: + if output_format not in ("hits", "evidence"): + typer.echo(f"Error: unknown --format {output_format!r} (use 'hits' or 'evidence').", err=True) + raise typer.Exit(1) + if max_text_chars is not None and output_format != "hits": + typer.echo("Error: --max-text-chars only applies to --format hits.", err=True) + raise typer.Exit(1) if reranker_invoke_url is None: reranker_invoke_url = os.environ.get("RERANKER_INVOKE_URL") or None if embed_invoke_url is None: embed_invoke_url = os.environ.get("EMBED_INVOKE_URL") or None rerank = rerank or bool(reranker_invoke_url) or bool(reranker_model_name) or bool(reranker_backend) _silence_noisy_libraries() + + def _run(use_hybrid: bool) -> list: + return query_documents( + QueryRequest( + query=query, + retrieval=QueryRetrievalOptions( + top_k=top_k, + candidate_k=candidate_k, + page_dedup=page_dedup, + content_types=content_types, + hybrid=use_hybrid, + ), + embed=QueryEmbedOptions( + embed_invoke_url=embed_invoke_url, + embed_model_name=embed_model_name, + ), + rerank=QueryRerankOptions( + enabled=rerank, + reranker_invoke_url=reranker_invoke_url, + reranker_model_name=reranker_model_name, + reranker_backend=reranker_backend, + ), + storage=QueryStorageOptions( + lancedb_uri=lancedb_uri, + table_name=table_name, + ), + ) + ) + try: with _quiet_capture(): - hits = query_documents( - QueryRequest( - query=query, - retrieval=QueryRetrievalOptions( - top_k=top_k, - candidate_k=candidate_k, - page_dedup=page_dedup, - content_types=content_types, - ), - embed=QueryEmbedOptions( - embed_invoke_url=embed_invoke_url, - embed_model_name=embed_model_name, - ), - rerank=QueryRerankOptions( - enabled=rerank, - reranker_invoke_url=reranker_invoke_url, - reranker_model_name=reranker_model_name, - reranker_backend=reranker_backend, - ), - storage=QueryStorageOptions( - lancedb_uri=lancedb_uri, - table_name=table_name, - ), - ) - ) + if hybrid: + try: + hits = _run(True) + strategies = ["semantic", 
"lexical"] + except Exception: # noqa: BLE001 — e.g. table has no FTS index; degrade to vector-only + hits = _run(False) + strategies = ["semantic"] + else: + hits = _run(False) + strategies = ["semantic"] except _ROOT_CLI_ERRORS as exc: typer.echo(f"Error: {exc}", err=True) raise typer.Exit(1) from exc - typer.echo(json.dumps([_query_cli_hit(hit) for hit in hits], indent=2, sort_keys=True, default=str)) + if output_format == "evidence": + result = build_evidence_result(hits, strategies) + typer.echo(json.dumps(result, indent=2, sort_keys=True, default=str)) + else: + typer.echo( + json.dumps([_query_cli_hit(hit, max_text_chars) for hit in hits], indent=2, sort_keys=True, default=str) + ) @app.callback() diff --git a/nemo_retriever/src/nemo_retriever/common/vdb/records.py b/nemo_retriever/src/nemo_retriever/common/vdb/records.py index bcf8d0091a..2b498036af 100644 --- a/nemo_retriever/src/nemo_retriever/common/vdb/records.py +++ b/nemo_retriever/src/nemo_retriever/common/vdb/records.py @@ -66,6 +66,25 @@ def _dict_or_empty(value: Any) -> dict[str, Any]: return dict(value) if isinstance(value, dict) else {} +def _derive_fidelity(content_type: Any, metadata: dict[str, Any], content_metadata: dict[str, Any]) -> str | None: + """Map a chunk's modality + real provenance signals to a trust tier. + + verbatim (PDF text layer) > ocr (scanned/region OCR) > transcribed (ASR) > + vlm_caption (chart/image model caption). Returns None for unknown types so + the field is omitted rather than guessed. 
+ """ + t = str(content_type or "").lower() + if t in ("audio", "video", "video_frame"): + return "transcribed" + if t == "image": + return "ocr" if content_metadata.get("subtype") == "page_image" else "vlm_caption" + if t.startswith(("table", "chart", "infographic")): + return "ocr" + if t == "text": + return "ocr" if metadata.get("needs_ocr_for_text") is True else "verbatim" + return None + + def _client_record_from_graph_row(row: dict[str, Any]) -> dict[str, Any] | None: metadata = _dict_or_empty(row.get("metadata")) @@ -84,6 +103,9 @@ def _client_record_from_graph_row(row: dict[str, Any]) -> dict[str, Any] | None: content_type = row.get("_content_type") or row.get("content_type") if content_type: content_metadata.setdefault("type", content_type) + fidelity = _derive_fidelity(content_type, metadata, content_metadata) + if fidelity: + content_metadata.setdefault("fidelity", fidelity) stored_image_uri = row.get("_stored_image_uri") or row.get("stored_image_uri") if stored_image_uri: content_metadata.setdefault("stored_image_uri", stored_image_uri) diff --git a/nemo_retriever/src/nemo_retriever/ingest/plan.py b/nemo_retriever/src/nemo_retriever/ingest/plan.py index cc6d2ceeeb..e2914b5d63 100644 --- a/nemo_retriever/src/nemo_retriever/ingest/plan.py +++ b/nemo_retriever/src/nemo_retriever/ingest/plan.py @@ -201,6 +201,8 @@ class IngestStorageOptions: lancedb_uri: str = "lancedb" table_name: str = "nemo-retriever" overwrite: bool = True + # Also build the LanceDB FTS/BM25 index so `query --hybrid` can fuse lexical + vector. 
+ hybrid: bool = False @dataclass(frozen=True) @@ -625,13 +627,18 @@ def resolve_ingest_plan(request: IngestPlanRequest) -> ResolvedIngestPlan: ) extract_params = ExtractParams(**extract_kwargs) embed_params = EmbedParams(**embed_kwargs) if embed_kwargs else None - vdb_params = VdbUploadParams( - vdb_kwargs={ - "uri": storage.lancedb_uri, - "table_name": storage.table_name, - "overwrite": bool(storage.overwrite), - } - ) + vdb_upload_kwargs = { + "uri": storage.lancedb_uri, + "table_name": storage.table_name, + "overwrite": bool(storage.overwrite), + } + # `hybrid` is a vdb table-build knob, like `overwrite`/`uri`/`table_name` above: it rides + # on storage options and is forwarded as a LanceDB-backend ctor kwarg, where hybrid=True + # makes ingest also build the FTS/BM25 index that `query --hybrid` searches. Injected only + # when opted in, so vector-only ingests keep the exact legacy vdb_kwargs. + if storage.hybrid: + vdb_upload_kwargs["hybrid"] = True + vdb_params = VdbUploadParams(vdb_kwargs=vdb_upload_kwargs) caption_params = _build_caption_params(request.caption) dedup_params = _build_dedup_params(request.dedup) store_params = _build_store_params(request.image_store) diff --git a/nemo_retriever/src/nemo_retriever/models/llm/clients/judge.py b/nemo_retriever/src/nemo_retriever/models/llm/clients/judge.py index 85364f9cc7..a8f63f9762 100644 --- a/nemo_retriever/src/nemo_retriever/models/llm/clients/judge.py +++ b/nemo_retriever/src/nemo_retriever/models/llm/clients/judge.py @@ -235,7 +235,9 @@ class LLMJudge: """ _DEFAULT_MODEL: str = "nvidia_nim/nvidia/llama-3.3-nemotron-super-49b-v1.5" - _DEFAULT_SAMPLING: LLMInferenceParams = LLMInferenceParams(temperature=0.1, max_tokens=4096) + # max_tokens must accommodate the Nemotron reasoning block + the final + # {"rating": X}; NVIDIA's llm-judge recipe uses 32768. 4096 truncated mid-think. 
+ _DEFAULT_SAMPLING: LLMInferenceParams = LLMInferenceParams(temperature=0.1, max_tokens=32768) def __init__( self, @@ -296,6 +298,11 @@ def _rate(self, prefix: str, query: str, user_answer: str, reference_answer: str ``_get_judge_rating``: retry on an invalid rating or a transport error up to ``num_retries`` attempts, then give up with NaN. """ + # The prompt already forbids explanation and demands `{"rating": X}`, and + # _parse_rating strips any <think> block — i.e. the judge is designed for the + # Nemotron reasoning model. The only requirement is a large enough max_tokens + # for the reasoning block to finish and still emit the rating (NVIDIA's + # llm-judge recipe uses 32768); 4096 truncated mid-think -> null content. messages = [{"role": "user", "content": _render_prompt(prefix, query, user_answer, reference_answer)}] attempts = max(1, self.transport.num_retries) last_exc: Optional[Exception] = None diff --git a/nemo_retriever/src/nemo_retriever/query/options.py b/nemo_retriever/src/nemo_retriever/query/options.py index 331ac82379..74427a6aed 100644 --- a/nemo_retriever/src/nemo_retriever/query/options.py +++ b/nemo_retriever/src/nemo_retriever/query/options.py @@ -14,6 +14,9 @@ class QueryRetrievalOptions: candidate_k: int | None = None page_dedup: bool = False content_types: str | Sequence[str] | None = None + # Fused vector + full-text (BM25) retrieval. Opt-in (default off) preserves the + # legacy vector-only path; requires the LanceDB table to carry an FTS index. 
+ hybrid: bool = False @dataclass(frozen=True) diff --git a/nemo_retriever/src/nemo_retriever/query/workflow.py b/nemo_retriever/src/nemo_retriever/query/workflow.py index 3a92ed1ea4..8133d11f6d 100644 --- a/nemo_retriever/src/nemo_retriever/query/workflow.py +++ b/nemo_retriever/src/nemo_retriever/query/workflow.py @@ -35,12 +35,16 @@ def _build_rerank_kwargs(options: QueryRerankOptions) -> dict[str, str]: def _build_retriever_kwargs(request: QueryRequest) -> dict[str, Any]: embed_kwargs = build_embed_option_kwargs(request.embed.embed_invoke_url, request.embed.embed_model_name) + vdb_kwargs: dict[str, Any] = { + "uri": request.storage.lancedb_uri, + "table_name": request.storage.table_name, + } + # Only inject hybrid when opted in, so the vector-only path stays byte-for-byte legacy. + if request.retrieval.hybrid: + vdb_kwargs["hybrid"] = True retriever_kwargs: dict[str, Any] = { "top_k": request.retrieval.top_k, - "vdb_kwargs": { - "uri": request.storage.lancedb_uri, - "table_name": request.storage.table_name, - }, + "vdb_kwargs": vdb_kwargs, } if embed_kwargs: retriever_kwargs["embed_kwargs"] = embed_kwargs diff --git a/nemo_retriever/tests/test_fidelity.py b/nemo_retriever/tests/test_fidelity.py new file mode 100644 index 0000000000..1d8e395d38 --- /dev/null +++ b/nemo_retriever/tests/test_fidelity.py @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-26, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from nemo_retriever.common.vdb.records import _client_record_from_graph_row, _derive_fidelity + + +def _fidelity_of(row: dict) -> object: + rec = _client_record_from_graph_row(row) + assert rec is not None + return rec["metadata"]["content_metadata"].get("fidelity") + + +def _row(content_type, *, needs_ocr=None, subtype=None) -> dict: + meta: dict = {"embedding": [0.1, 0.2]} + if needs_ocr is not None: + meta["needs_ocr_for_text"] = needs_ocr + cm: dict = {"page_number": 1} + if subtype is not None: + cm["subtype"] = subtype + meta["content_metadata"] = cm + return {"text": "x", "metadata": meta, "_content_type": content_type} + + +def test_derive_fidelity_pure_mapping() -> None: + assert _derive_fidelity("text", {}, {}) == "verbatim" + assert _derive_fidelity("text", {"needs_ocr_for_text": True}, {}) == "ocr" + assert _derive_fidelity("image", {}, {}) == "vlm_caption" + assert _derive_fidelity("image", {}, {"subtype": "page_image"}) == "ocr" + assert _derive_fidelity("table", {}, {}) == "ocr" + assert _derive_fidelity("chart_caption", {}, {}) == "ocr" + assert _derive_fidelity("audio", {}, {}) == "transcribed" + assert _derive_fidelity("video", {}, {}) == "transcribed" + assert _derive_fidelity("", {}, {}) is None + assert _derive_fidelity("mystery", {}, {}) is None + + +def test_fidelity_stamped_into_stored_record() -> None: + assert _fidelity_of(_row("text")) == "verbatim" + assert _fidelity_of(_row("text", needs_ocr=True)) == "ocr" + assert _fidelity_of(_row("image")) == "vlm_caption" + assert _fidelity_of(_row("image", subtype="page_image")) == "ocr" + assert _fidelity_of(_row("table")) == "ocr" + assert _fidelity_of(_row("audio")) == "transcribed" diff --git a/nemo_retriever/tests/test_lancedb_retrieval_where.py b/nemo_retriever/tests/test_lancedb_retrieval_where.py index 15eca336c2..0d614aef2c 100644 --- a/nemo_retriever/tests/test_lancedb_retrieval_where.py +++ 
b/nemo_retriever/tests/test_lancedb_retrieval_where.py @@ -49,7 +49,7 @@ def _tiny_table(uri: str, *, create_fts_index: bool = False) -> None: def test_retrieval_where_filters_rows() -> None: d = tempfile.mkdtemp() _tiny_table(d) - op = LanceDB(uri=d, table_name="t", overwrite=False, vector_dim=2, validate_vector_length=False) + op = LanceDB(uri=d, table_name="t", overwrite=False, vector_dim=2, validate_vector_length=False, hybrid=False) qv = [1.0, 0.0] unfiltered = op.retrieval([qv], top_k=10, table_path=d, table_name="t") assert len(unfiltered[0]) == 2 @@ -61,7 +61,7 @@ def test_retrieval_where_filters_rows() -> None: def test_retrieval_filter_alias() -> None: d = tempfile.mkdtemp() _tiny_table(d) - op = LanceDB(uri=d, table_name="t", overwrite=False, vector_dim=2, validate_vector_length=False) + op = LanceDB(uri=d, table_name="t", overwrite=False, vector_dim=2, validate_vector_length=False, hybrid=False) qv = [1.0, 0.0] filtered = op.retrieval([qv], top_k=10, table_path=d, table_name="t", _filter="text = 'beta'") assert len(filtered[0]) == 1 @@ -71,7 +71,7 @@ def test_retrieval_filter_alias() -> None: def test_retrieval_where_precedence_over_filter() -> None: d = tempfile.mkdtemp() _tiny_table(d) - op = LanceDB(uri=d, table_name="t", overwrite=False, vector_dim=2, validate_vector_length=False) + op = LanceDB(uri=d, table_name="t", overwrite=False, vector_dim=2, validate_vector_length=False, hybrid=False) qv = [1.0, 0.0] filtered = op.retrieval( [qv], @@ -88,7 +88,7 @@ def test_retrieval_where_precedence_over_filter() -> None: def test_retrieval_metadata_like_predicate() -> None: d = tempfile.mkdtemp() _tiny_table(d) - op = LanceDB(uri=d, table_name="t", overwrite=False, vector_dim=2, validate_vector_length=False) + op = LanceDB(uri=d, table_name="t", overwrite=False, vector_dim=2, validate_vector_length=False, hybrid=False) qv = [1.0, 0.0] pred = '%"doc_id": "x"%' filtered = op.retrieval([qv], top_k=10, table_path=d, table_name="t", where=f"metadata LIKE 
'{pred}'") diff --git a/nemo_retriever/tests/test_llm_params.py b/nemo_retriever/tests/test_llm_params.py index ea4ddc8e82..ce706c5e9d 100644 --- a/nemo_retriever/tests/test_llm_params.py +++ b/nemo_retriever/tests/test_llm_params.py @@ -234,7 +234,7 @@ def test_structured_construction_uses_defaults(self): judge = LLMJudge(transport=transport) assert judge.model == "nvidia_nim/nvidia/llama-3.3-nemotron-super-49b-v1.5" assert judge.sampling.temperature == 0.1 - assert judge.sampling.max_tokens == 4096 + assert judge.sampling.max_tokens == 32768 def test_custom_sampling_override(self): from nemo_retriever.models.llm.clients import LLMJudge @@ -263,7 +263,7 @@ def test_from_kwargs_matches_structured(self): assert judge.transport.extra_params == {"user": "t"} # Sampling stays at judge defaults even when using flat constructor. assert judge.sampling.temperature == 0.1 - assert judge.sampling.max_tokens == 4096 + assert judge.sampling.max_tokens == 32768 def test_from_kwargs_accepts_sampling_overrides(self): from nemo_retriever.models.llm.clients import LLMJudge @@ -362,7 +362,7 @@ def test_judging_operator_constructs_cleanly(self): op = JudgingOperator(model="nvidia_nim/nvidia/llama-3.3-nemotron-super-49b-v1.5") assert op._judge.model == "nvidia_nim/nvidia/llama-3.3-nemotron-super-49b-v1.5" assert op._judge.sampling.temperature == 0.1 - assert op._judge.sampling.max_tokens == 4096 + assert op._judge.sampling.max_tokens == 32768 def test_judging_operator_plumbs_num_retries_to_inner_judge(self): """JudgingOperator(num_retries=...) 
must flow down to the LLMJudge it diff --git a/nemo_retriever/tests/test_root_cli_workflow.py b/nemo_retriever/tests/test_root_cli_workflow.py index 6e92fa1a09..c7b7cc9b9a 100644 --- a/nemo_retriever/tests/test_root_cli_workflow.py +++ b/nemo_retriever/tests/test_root_cli_workflow.py @@ -1160,3 +1160,25 @@ def fake_quiet_capture() -> Any: assert silenced == [True] assert captured_use == [True] assert "Ingested 1 file(s) → 3 row(s) in LanceDB lancedb/nemo-retriever." in result.output + + +def test_root_ingest_passes_hybrid_into_vdb_kwargs(monkeypatch, tmp_path) -> None: + fake_ingestor = _make_fake_ingestor() + doc = tmp_path / "a.pdf" + doc.write_bytes(b"%PDF-1.4\n") + + monkeypatch.setattr(ingest_execution, "create_ingestor", lambda **_: fake_ingestor) + monkeypatch.setattr(ingest_execution, "_count_lancedb_rows", lambda *_, **__: 1) + + result = RUNNER.invoke( + cli_main.app, + ["ingest", str(doc), "--lancedb-uri", "/tmp/lancedb", "--table-name", "docs", "--hybrid"], + ) + + assert result.exit_code == 0 + assert fake_ingestor.vdb_upload.call_args.args[0].vdb_kwargs == { + "uri": "/tmp/lancedb", + "table_name": "docs", + "overwrite": True, + "hybrid": True, + } diff --git a/nemo_retriever/tests/test_root_query_cli.py b/nemo_retriever/tests/test_root_query_cli.py index 45715b2859..ba8ca528a9 100644 --- a/nemo_retriever/tests/test_root_query_cli.py +++ b/nemo_retriever/tests/test_root_query_cli.py @@ -37,8 +37,8 @@ def test_root_query_passes_query_options_and_prints_json(monkeypatch) -> None: }, ] expected_output = [ - {"source": "doc.pdf", "page_number": 1, "text": "passage"}, - {"source": "other.pdf", "page_number": 2, "text": "other"}, + {"source": "doc.pdf", "page_number": 1, "text": "passage", "modality": "text", "score": 0.2}, + {"source": "other.pdf", "page_number": 2, "text": "other", "modality": "table", "score": 0.4}, ] class FakeRetriever: @@ -108,7 +108,7 @@ def query(self, query: str, **kwargs: Any) -> list[dict[str, Any]]: assert result.exit_code == 0 
assert query_kwargs == [{"candidate_k": 3, "page_dedup": True, "content_types": "text,table"}] assert json.loads(result.output) == [ - {"page_number": 1, "source": "doc.pdf", "text": "text row"}, + {"page_number": 1, "source": "doc.pdf", "text": "text row", "modality": "text", "score": None}, ] @@ -277,3 +277,52 @@ def fail_query_documents(*_args: Any, **_kwargs: Any) -> list[dict[str, Any]]: assert result.exit_code == 1 assert "Error: database unavailable" in result.output + + +def test_root_query_passes_hybrid_into_vdb_kwargs(monkeypatch) -> None: + retriever_calls: list[dict[str, Any]] = [] + + class FakeRetriever: + def __init__(self, **kwargs: Any) -> None: + retriever_calls.append(kwargs) + + def query(self, query: str, **_kwargs: Any) -> list[dict[str, Any]]: + return [] + + monkeypatch.setattr(query_core, "Retriever", FakeRetriever) + + result = RUNNER.invoke( + cli_main.app, + ["query", "q", "--top-k", "5", "--lancedb-uri", "/tmp/lancedb", "--table-name", "docs", "--hybrid"], + ) + + assert result.exit_code == 0 + assert retriever_calls == [ + {"top_k": 5, "vdb_kwargs": {"uri": "/tmp/lancedb", "table_name": "docs", "hybrid": True}} + ] + + +def test_root_query_max_text_chars_truncates_and_omits(monkeypatch) -> None: + hits = [{"text": "abcdefghij", "source": "d.pdf", "page_number": 1, "metadata": {"type": "text"}, "_distance": 0.1}] + + class FakeRetriever: + def __init__(self, **_: Any) -> None: + pass + + def query(self, query: str, **_kwargs: Any) -> list[dict[str, Any]]: + return hits + + monkeypatch.setattr(query_core, "Retriever", FakeRetriever) + + snip = RUNNER.invoke(cli_main.app, ["query", "q", "--max-text-chars", "5"]) + assert snip.exit_code == 0 + snip_hit = json.loads(snip.output)[0] + assert snip_hit["text"] == "abcde…" + assert snip_hit["modality"] == "text" # non-text fields intact + assert snip_hit["source"] == "d.pdf" + + meta = RUNNER.invoke(cli_main.app, ["query", "q", "--max-text-chars", "0"]) + meta_hit = json.loads(meta.output)[0] + 
assert meta_hit["text"] == "" + assert meta_hit["source"] == "d.pdf" + assert meta_hit["page_number"] == 1 diff --git a/skills/nemo-retriever/SKILL.md b/skills/nemo-retriever/SKILL.md index 1292e65c4d..e3ef29c6fd 100644 --- a/skills/nemo-retriever/SKILL.md +++ b/skills/nemo-retriever/SKILL.md @@ -9,7 +9,7 @@ allowed-tools: Bash Write Read The `retriever` CLI indexes a folder of PDFs into LanceDB (`retriever ingest`) and serves vector search over it (`retriever query`). For any task about searching/answering questions across a folder of PDFs, use this CLI — do not write a custom RAG. -**Beyond PDFs and beyond semantic search.** `retriever ingest` also handles images, Office, HTML, TXT, audio, and video — see `references/setup.md` for the per-format recipe and `references/install.md` for the install extras (`[multimedia]`, libreoffice, ffmpeg). For non-semantic operations — page filter, verbatim quote with citation, corpus-level aggregate, chart/image caption hits — see `references/query.md`. Don't fall back to native Read/Grep/Python on non-PDF inputs. +**Beyond PDFs and beyond semantic search.** `retriever ingest` also handles images, Office, HTML, TXT, audio, and video — see `references/setup.md` for the per-format recipe and `references/install.md` for the install extras (`[multimedia]`, libreoffice, ffmpeg). The query turn is a single command — see **§Query turn** below (inline, no reference read needed); `references/cli/query.md` holds only the fallback detail (exact-term, chart text-extract, compose-reply). Don't fall back to native Read/Grep/Python on non-PDF inputs. 
## Install (if `retriever` is missing) @@ -20,17 +20,27 @@ If `command -v retriever` returns nothing, follow `references/install.md` to ins | Turn type | Read this once | Then execute | | :--- | :--- | :--- | | **Setup turn** (first turn — `./lancedb/nemo-retriever.lance` doesn't exist) | `references/setup.md` | Build the index | -| **Query turn** (every subsequent turn — user asks a question) | `references/query.md` | One `retriever query` call, then `Write` `./output.json` *(eval-harness contract only — for general use, just answer in chat; see `query.md` top callout)* | +| **Query turn** (every subsequent turn — user asks a question) | **§Query turn** below (command inline — no reference read needed) | Run it, then `Write` `./output.json` *(eval-harness contract only — for general use, just answer in chat)* | | Anything errored or returned empty | `references/troubleshooting.md` | Apply the named recovery; do not improvise | -For the full `retriever ingest` / `retriever query` CLI specs, see `references/cli/ingest.md` and `references/cli/query.md`. You do not need these for routine turns — `/bin/retriever --help` is faster. +## Query turn — run this, then write the answer + +`/bin/retriever query "" --format evidence --hybrid --top-k 10` → JSON +`{ evidence: [ { text, source, locator, modality, fidelity, score, citation } ], coverage: {...} }`. +That's the FIRST (usually only) call — don't `ls`/`find`/`sed`/Read to orient first; it already searched the whole corpus. Then: +- **Lead with the direct answer** (the exact figure, or Yes/No) for the exact entity asked; address every entity / year / category the question names — even "not provided". +- **Trust by fidelity** (`verbatim > ocr > transcribed > vlm_caption`): a number or directional claim resting ONLY on a `vlm_caption` (chart/image) is unconfirmed — quote it tagged "(chart-derived, unconfirmed)" unless a higher-fidelity item states the same fact. Never fabricate from adjacent text. 
+- Re-`query` only if the answer isn't yet supported — once per genuinely distinct sub-question (per entity when comparing/listing), or with the exact term when `coverage.thin_spots` flags a miss. +- Open `references/cli/query.md` ONLY for the fallback path (exact-term re-query, chart text-extract, compose-reply detail) — a normal answer needs none of it. + +For the full `retriever ingest` CLI spec, see `references/cli/ingest.md`. For `retriever query` flags, `/bin/retriever query --help` is authoritative (and faster) — you do not need it for routine turns. Before ingesting a mixed folder, inventory extensions (`find -name '*.*' | sed 's/.*\.//' | sort -u`) — `--input-type=auto` silently drops anything outside the supported set. See `references/troubleshooting.md` "Unsupported file types". ## Hard limits (apply to every turn) - **Setup turn**: build the index in one shell command (see `references/setup.md`). STOP after the index lands. -- **Query turn**: at most **2 Bash calls** — 1 `retriever query`, +1 optional targeted text-extract per `references/query.md`. Reply and then STOP. +- **Query turn**: at most **2 Bash calls** — 1 `retriever query`, +1 optional targeted text-extract per `references/cli/query.md`. Reply and then STOP. - **No narration between tool calls.** Tokens you emit between calls become input + cached input for every later turn — quadratic cost. Go straight from reading the summary to writing the JSON file. - **Banned**: `TodoWrite`, Glob, Grep, `Read` of whole PDFs, re-running setup, spawning subagents, speculative "confirmation" calls. diff --git a/skills/nemo-retriever/contract/CONTRACT.md b/skills/nemo-retriever/contract/CONTRACT.md new file mode 100644 index 0000000000..14c56f0e9d --- /dev/null +++ b/skills/nemo-retriever/contract/CONTRACT.md @@ -0,0 +1,47 @@ +# retriever skill↔engine contract + +`contract_version` (see `cli-contract.json`) is the semver the **skill** asserts +about the installed **engine**. 
Run `scripts/doctor.py` to verify the installed +`retriever` satisfies it. + +The skill's one primitive is **`retriever query --format evidence --hybrid`** → +`{ evidence, coverage }`. The `query` engine defaults are `--format hits` (a flat ranked +list) and vector-only (`--hybrid` off); the skill opts into `--format evidence` +(fidelity-tagged evidence + coverage) and `--hybrid` (vector+BM25) **explicitly**, so +plain `query` callers are unaffected. `query` *also* exposes `--rerank`, `--candidate-k`, +`--content-types`, `--page-dedup` (unused by the skill); the contract gates the skill's +invocation + result shape, not the full flag surface. + +## Files +- `cli-contract.json` — the gated surface: required subcommands, `query`'s required + flags + default format/hybrid, and `ingest`'s flags. `default_table_name` is the + engine's table-name constant (operator config), not the skill name. +- `query-result.schema.json` — the shape `retriever query --format evidence` emits and the + skill reasons over: `evidence[]` (each with `text, source, locator, modality, + fidelity, score, citation`) + `coverage`. This is THE contract the skill relies on. + +## Versioning +- Bump **patch** for clarifications, **minor** for additive engine capabilities the + skill can use, **major** when the engine changes something the skill relies on + (a `query` evidence/coverage field, the default `--format`/`--hybrid` behavior, or + the gated primitive). A major bump means the skill must be updated in the same change. +- `doctor.py` fails if the installed engine no longer matches `cli-contract.json` / + `query-result.schema.json`. + +## How drift gets caught +`doctor.py` runs on the skill's setup turn. 
It +performs a LIVE probe — ingest a tiny built-in document, run `retriever query --format evidence`, +validate `{evidence, coverage}` (including the `fidelity` enum) +against `query-result.schema.json` — plus static `--help` checks: the required +subcommands (`ingest`, `query`) exist and `query` exposes its required +flags (`--top-k`, `--hybrid`, `--format`). Any divergence (a renamed evidence field, a +missing `fidelity`, a dropped `--format`, `--input-type` reappearing on `ingest`) fails +loudly with a remediation hint. + +## Changelog +- **0.1.0** — skill-first contract built around **`retriever query --format evidence --hybrid`** + → `{evidence, coverage}` (validated against `query-result.schema.json`). The gated + subcommands are `ingest` and `query`; `query`'s engine defaults are `--format hits` and + vector-only, and the skill passes `--format evidence`/`--hybrid` explicitly. + `query` may expose extra knobs (`--rerank`, `--candidate-k`, …) — they're allowed but unused + by the skill, so the contract gates the invocation + result shape, not the full flag surface. 
diff --git a/skills/nemo-retriever/contract/cli-contract.json b/skills/nemo-retriever/contract/cli-contract.json new file mode 100644 index 0000000000..4332ce3813 --- /dev/null +++ b/skills/nemo-retriever/contract/cli-contract.json @@ -0,0 +1,20 @@ +{ + "contract_version": "0.1.0", + "primitive": "query", + "subcommands_required": ["ingest", "query"], + "requires_hybrid_index": true, + "query": { + "required_flags": ["--top-k", "--hybrid", "--format"], + "operator_flags": ["--lancedb-uri", "--table-name", "--embed-model-name"], + "skill_invocation": "retriever query \"\" --format evidence --hybrid", + "engine_default_format": "hits", + "engine_default_hybrid": false, + "result_schema": "query-result.schema.json" + }, + "ingest": { + "required_flags": ["--append", "--overwrite", "--hybrid", "--ocr-version", "--ocr-lang", "--table-name", "--lancedb-uri", "--embed-model-name"], + "forbidden_flags": ["--input-type"], + "single_pass_multiformat": true + }, + "default_table_name": "nemo-retriever" +} diff --git a/skills/nemo-retriever/contract/query-result.schema.json b/skills/nemo-retriever/contract/query-result.schema.json new file mode 100644 index 0000000000..ec7d742a87 --- /dev/null +++ b/skills/nemo-retriever/contract/query-result.schema.json @@ -0,0 +1,39 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "retriever/query-result", + "title": "retriever query --format evidence result (engine, contract 0.1.0)", + "description": "What `retriever query --format evidence` emits and the skill reasons over (query's default output is the legacy `--format hits` flat list).", + "type": "object", + "required": ["evidence", "coverage"], + "properties": { + "evidence": { "type": "array", "items": { "$ref": "#/$defs/evidence_item" } }, + "coverage": { "$ref": "#/$defs/coverage" } + }, + "additionalProperties": true, + "$defs": { + "evidence_item": { + "type": "object", + "required": ["text", "source", "locator", "modality", "fidelity", "score", 
"citation"], + "properties": { + "text": { "type": "string" }, + "source": { "type": "string", "description": "source basename (.pdf stripped)" }, + "locator": { "type": "object", "description": "{ kind: page|segment|timestamp|bbox, value }" }, + "modality": { "type": "string", "description": "text | table | chart | image | audio | video_frame" }, + "fidelity": { "type": "string", "enum": ["verbatim", "ocr", "transcribed", "vlm_caption"] }, + "score": { "type": "number", "description": "relevance/distance; hit ORDER is authoritative" }, + "citation": { "type": "string", "description": "source + locator, ready to cite" } + }, + "additionalProperties": true + }, + "coverage": { + "type": "object", + "required": ["strategies_used", "n_docs_seen", "thin_spots"], + "properties": { + "strategies_used": { "type": "array" }, + "n_docs_seen": { "type": "integer" }, + "thin_spots": { "type": "array" } + }, + "additionalProperties": true + } + } +} diff --git a/skills/nemo-retriever/references/cli/query.md b/skills/nemo-retriever/references/cli/query.md index dc9ed5309d..5f7587183f 100644 --- a/skills/nemo-retriever/references/cli/query.md +++ b/skills/nemo-retriever/references/cli/query.md @@ -1,91 +1,34 @@ -# retriever query - -Embed a text query and return the top-k nearest rows from a LanceDB table -previously written by `retriever ingest` (or any compatible pipeline). - -If flags below look stale, re-check `retriever query --help`. - -## When to use this - -- You have already ingested documents and want to retrieve relevant - chunks/primitives for a natural-language query. -- You want a one-shot CLI lookup — no service, no UI. - -**Use a different command when:** - -- You want recall metrics over a labelled query set → `retriever recall`. -- You want to grade end-to-end QA quality → `retriever eval`. -- You want a long-running query endpoint → `retriever service`. -- You want to compare two retrieval runs → `retriever compare`. 
- -## Canonical invocations - -Top-10 search against the default table: +# Query turn — the WHOLE workflow ```bash -/bin/retriever query "what is in chart 1?" +timeout 2000 /bin/retriever query "" --format evidence --hybrid --top-k 10 \ + --embed-model-name nvidia/llama-nemotron-embed-1b-v2 --query-embed-backend hf \ + | tee ./evidence.json ``` -Top-3, custom table: +That's your FIRST tool call on every query turn, run **exactly** as one pipeline (cold runs take ~20–30s; wait for it — don't background it or fire parallel queries). Do not Read, Glob, Grep, or list PDFs first — those duplicate what `retriever query` already did. `--format evidence` returns answer-ready JSON: -```bash -/bin/retriever query "average frequency ranges for tweeters" \ - --top-k 3 \ - --lancedb-uri ./my-lancedb \ - --table-name my-corpus +``` +{ "evidence": [ { text, source, locator, modality, fidelity, score, citation } ], "coverage": {...} } ``` -## Inputs - -- **Positional `QUERY`** — single text string. Required. Quote it in the shell - to keep multi-word queries intact. - -## Outputs +`tee ./evidence.json` keeps the full result in the cwd (not `/tmp` — clobbered under parallel queries). Read it back only as needed (`/bin/python -c "import json; print(json.load(open('./evidence.json'))['evidence'][0]['text'])"`); pulling all chunks' text into context inflates cached prompt size on every later turn. -- JSON array on stdout, one object per hit, in retriever ranking order. -- The root CLI intentionally returns compact objects: - - `source` — origin document path. - - `page_number` — 1-indexed page when available. - - `text` — retrieved primitive text, table text, chart text, or image caption. -- Internal scores, raw metadata, and bounding boxes are available from the Python - `Retriever.query(...)` API, not the public root CLI output. 
+**No narration between tool calls.** Do not write "Let me search…", "The retriever returned…", or any commentary — every token between the `query` call and the `Write` of `./output.json` becomes input (and cached input) for every later turn (quadratic cost). Go straight from reading the result to writing the file. -## Key flags +Each evidence item carries: `text`, `source` (doc basename), `locator` (`{kind: page, value: }`), `modality` (`text|table|chart|image|audio|video_frame`), **`fidelity`** (`verbatim > ocr > transcribed > vlm_caption`), `score`, and `citation` (ready-to-quote source + locator). Hit ORDER is authoritative; `score` is informational. -| Flag | Default | Notes | -|---|---|---| -| `--top-k` | `10` | Final number of hits to return. Must be >= 1. | -| `--candidate-k` | unset | Wider pre-filter/pre-dedup candidate pool. When set, it must be >= `--top-k`; make it larger when `--page-dedup` or `--content-types` could reduce final hits. | -| `--page-dedup` | `false` | Collapse results to unique document pages. | -| `--content-types` | unset | Comma-separated content types to keep, such as `text,table` or `image,chart`; query-time values are normalized to canonical hit metadata types, `images` is accepted as an alias for captioned image rows, and untyped hits are excluded. | -| `--lancedb-uri` | `lancedb` | Must match what `ingest` wrote to. | -| `--table-name` | `nemo-retriever` | Must match what `ingest` wrote to. | +## Trust by fidelity — the core of a correct answer -## Ranking interpretation +A number or directional claim resting ONLY on a `vlm_caption` (chart/image transcription) is **unconfirmed** — chart transcriptions often flip direction words (`increase`↔`decrease`) or misread exact figures. Prefer `verbatim`/`ocr`/`table` evidence for exact values. If the figure you need appears only in a `vlm_caption`, quote it verbatim and tag "(chart-derived, unconfirmed)" unless a higher-fidelity item states the same fact. 
Never upgrade a low-fidelity reading to a confident fact. -- The embedder (`llama-nemotron-embed-vl-1b-v2`) returns mean-pooled vectors; - LanceDB ranks by L2 distance by default. The root CLI hides raw distance values; - treat result order as ranking-only, not calibrated confidence. -- The query uses the **VL** variant of the embedder so text queries can match - ingested image/chart embeddings as well as text. Expect mixed-modality hits - in the result list. +## When the answer isn't in the first result -## Common failure modes +Re-`query` only when the top evidence doesn't yet answer — for a genuinely *distinct* sub-question (per entity when comparing/listing), or **with the exact term/phrase** when `coverage.thin_spots` flags a miss or a specific ID/code/figure isn't in the returned text (the fused BM25 leg matches exact strings semantic search skips — e.g. re-query `"mRNA-1273"` to surface every chunk that names it). Read `coverage.thin_spots` to tell "broaden the search" from "out of corpus". Do NOT re-issue reworded variants of the same question, reach for `pdftotext`/`pdfgrep`, or open the LanceDB table yourself — `query` already searched the whole corpus. -- **Empty result array** — table is empty (no ingest run yet) or - `--table-name` / `--lancedb-uri` don't match where ingest wrote. -- **`Table 'nemo-retriever' was not found`** — same root cause: wrong table/URI, - or ingest hasn't been run. -- **First query is slow (~10–15s)** — vLLM startup for the query embedder. - Subsequent queries in the same process are sub-second; one-shot CLI - invocations always pay this cost. -- **Surprisingly low-relevance top hit** — for very short corpora, even - unrelated queries return *something*. Broaden with `--candidate-k`, use - `--page-dedup` for page diversity, or use `--content-types` for targeted - table/chart/image-caption searches. 
+## Compose your reply from the evidence -## Related +- `final_answer`: **lead with the direct answer** — the exact figure (in the evidence's own units) or a bare Yes/No, for the exact entity asked — then support it. Synthesize from the evidence `text`. One paragraph, no restating the question, no hedging caveats. **Re-read the question**: address every entity / year / category it names, even those the evidence marks "not provided" (missing entities lose more judge points than imprecise numbers). If the asked-for fact isn't in the evidence, say so explicitly — never invent or extrapolate from adjacent material. +- `ranked_retrieved`: one entry per evidence item in returned order: `{"doc_id": "", "page_number": , "rank": }`. Up to 10. **Indexing:** `locator.value` is 1-indexed; if the task's schema says 0-indexed, emit `value - 1`, else emit as-is. -- [[ingest]] — populate the table this command reads. -- `retriever recall --help` — batch query → recall@k against ground truth. -- `retriever eval --help` — end-to-end QA evaluation. +After your reply, STOP. No print, no summary, no further tool calls. diff --git a/skills/nemo-retriever/references/query.md b/skills/nemo-retriever/references/query.md deleted file mode 100644 index 8d451d837d..0000000000 --- a/skills/nemo-retriever/references/query.md +++ /dev/null @@ -1,71 +0,0 @@ -# Query turn — the WHOLE workflow - - -```bash -timeout 2000 /bin/retriever query "" --top-k 10 --embed-model-name nvidia/llama-nemotron-embed-1b-v2 --query-embed-backend hf --reranker-backend hf --rerank \ - | tee ./hits.json \ - | /bin/python -c "import json,sys,os; [print(f'rank={i+1} page={h[\"page_number\"]} doc={os.path.basename(h[\"source\"])}') for i,h in enumerate(json.load(sys.stdin))]" -``` - -Run that **exactly** as a single pipeline — do not split it into `HITS=$(...)` + `echo "$HITS" | /bin/python -c ...` (the assignment swallows stdout, the pipe sees nothing, you waste 3 bash calls recovering). 
Stdout is clean JSON (model-init logs are silenced at the CLI layer); leave stderr unredirected so real errors surface on the first call. The full hits land in `./hits.json` **in the current working directory** (not `/tmp` — a shared `/tmp` path gets clobbered when queries run in parallel). The summary above lists rank/page/doc — to read hit text for synthesizing `final_answer`, parse `./hits.json` directly. The top hit's text is one one-liner away: `/bin/python -c "import json; print(json.load(open('./hits.json'))[0]['text'])"` (or `[i]` for the rank-(i+1) hit). Fetch only what you need — pulling all 10 hits' text into context inflates cached prompt size on every subsequent turn. - -That's your FIRST tool call on every query turn. Do not Read, Glob, Grep, or list PDFs before this — those duplicate what `retriever query` already did. - -`--query-embed-backend hf` and `--reranker-backend hf` run the query embedder and reranker via HuggingFace instead of vLLM: a single query then loads in ~20–30s (vLLM's batch engine cold-starts much slower and hogs GPU memory). Same model, same hits — just a faster, lighter cold start for one-off queries. (Ingest still uses vLLM for batch throughput.) - -**No narration between tool calls.** Do not write "Let me search…", "I'll now analyze…", "The retriever returned…", or any other commentary. Every assistant token you emit between the `retriever query` Bash call and the `Write` of `./output.json` becomes input tokens (and cached input tokens) for every subsequent turn in this session — quadratic cost. Go straight from reading the summary to writing the JSON file. The only assistant text in a query turn should be the tool calls themselves. - -Each hit has exactly three keys: `source` (the **full PDF path** — the doc_id is its basename, `os.path.basename(h["source"])[:-4]` to drop `.pdf`), `page_number` (int, **1-indexed**: the first page of a PDF is page `1`), and `text`. 
There is no `pdf_basename`, `metadata`, `pdf_page`, or `_distance` field — referencing those raises `KeyError`. - -## Keyword/regex search across the corpus - -If you need exact text matches that semantic `retriever query` may have skipped — e.g. "find every mention of 'mRNA-1273' across all PDFs" — use: - -```bash -/bin/python /scripts/grep_corpus.py "" [--max-hits 50] -``` - -It scans the LanceDB table the retriever already built — no PDF re-extraction. Output is `:p:: ......` per hit; `NO_MATCH` if nothing. Counts against the same "one optional follow-up call" budget as the targeted text-extract (mutually exclusive — pick one). - -Don't reach for `pdftotext`, `pdftohtml`, or `pdfgrep` — they're system tools that aren't guaranteed installed on the user's machine. The retriever venv bundles pdfium and `lancedb`; `grep_corpus.py` and `retriever pdf stage page-elements --method pdfium` cover the same use cases without that dependency. - -## Compose your reply from the hits - -- `final_answer`: synthesize from the top hits' `text`. Include the exact number / name / date / row / column the question asks for, plus the source PDF and 0-indexed page. One paragraph. No restating the question, no hedging caveats. If the chunks talk *around* the fact but don't state it, run ONE `/bin/retriever pdf stage page-elements ./pdfs --method pdfium --json-output-dir /tmp/pdf_text --compact-json` and `Read` `/tmp/pdf_text/.pdf.pdf_extraction.json` for the rank-1 page (or rank-2 if rank-1 is metadata) — that almost always surfaces the exact figure. Then synthesize. **If after both calls the asked-for fact still isn't in the evidence, write `final_answer` that says so explicitly** — e.g. "The retrieved pages do not state [X] for [entity]; the closest content is [Y]." Do NOT invent, extrapolate, or generate plausible-sounding content from adjacent material. A confidently-wrong answer scores worse than an honest "not in the retrieved pages". 
-- `ranked_retrieved`: one entry per hit in the order `retriever query` returned: `{"doc_id": "", "page_number": , "rank": }`. Up to 10. Duplicate `(doc, page)` is fine. **Indexing:** the retriever's `page_number` is 1-indexed. If the task's output schema says 0-indexed (e.g. "first page is page 0"), emit `hit.page_number - 1`; if the task says 1-indexed or doesn't specify, emit `hit.page_number` as-is. - -**Before writing `final_answer`, re-read the question.** If it lists multiple entities, years, or categories, your answer must address each one explicitly — even if for some of them the chunks say "not provided" or contain no data. Missing entities lose more judge points than imprecise numbers. - -## Charts and images — the single biggest source of judge=2/3 trials - -When `metadata.type` of a hit is `chart` or `image`, its `text` field is a model-generated transcription that frequently: - -- reverses direction words (`increase`↔`decrease`, `rose`↔`fell`, `surge`↔`drop`), and -- rounds or misreads exact percentages (e.g. transcribing 12% as 20%). - -If a question asks for an exact percentage or a directional claim **and the evidence is only a chart/image hit** (no `text`-type hit corroborates the same number or direction): - -1. Run the targeted `/bin/retriever pdf stage page-elements --method pdfium` text-extract on the rank-1 PDF (this counts as your second tool call) and look for the number in prose. -2. If prose confirms the chart number, assert it confidently. -3. If prose doesn't mention it, **quote the chart transcription verbatim with an explicit hedge in `final_answer`**: "The chart on page N indicates [verbatim phrase] (chart-derived, not verified against prose)." Do NOT restate the chart's number as a confident fact. - -When both a chart hit and a text hit cover the same fact, always prefer the text hit's number. -After your reply, STOP. No print, no summary, no further tool calls. 
- -## Non-semantic operations (use these, don't fall back to native tools) - -**Page filter** — "what's on page N of doc.pdf" → filter LanceDB directly, no `Read`: - -```bash -/bin/python -c "import lancedb,json; df=lancedb.connect('./lancedb').open_table('nemo-retriever').to_pandas(); print('\n'.join(r['text'] for _,r in df.iterrows() if json.loads(r['source'])['source_name']=='APPLE_2022_10K.pdf' and json.loads(r['metadata'])['page_number']==14))" -``` - -**Verbatim quote with `[page]` citation** — quote retrieved chunks with `[page N]` markers in `final_answer`; don't paraphrase. - -**Corpus-level aggregate** — "list distinct sources", "count chunks per source" → no `ls`/`grep`/`find`: - -```bash -/bin/python -c "import lancedb,json; from collections import Counter; df=lancedb.connect('./lancedb').open_table('nemo-retriever').to_pandas(); names=[json.loads(s)['source_name'] for s in df['source']]; print(sorted(set(names))); print(dict(Counter(names)))" -``` - -**Image / chart captioning** — when the user asks to *describe / caption* an image (prose summary, not OCR text): `retriever ingest` already produces chart/image-type hits whose `text` field is the model-generated caption (see "Charts and images" above). Workflow: ingest the image folder (`setup.md` image recipe), then `retriever query` with a topic-related question — the hits with `metadata.type=chart|image` carry the caption in `text`. Use that as `final_answer`. No separate captioning CLI command. 
diff --git a/skills/nemo-retriever/references/setup.md b/skills/nemo-retriever/references/setup.md index 18f30fa80a..18dd0740c0 100644 --- a/skills/nemo-retriever/references/setup.md +++ b/skills/nemo-retriever/references/setup.md @@ -6,13 +6,13 @@ TOTAL_PAGES=$(/bin/python -c "import pypdfium2, glob; print(sum(len(pypdfium2.PdfDocument(p)) for p in glob.glob('./pdfs/*.pdf')))" 2>/dev/null || echo 0) echo "total_pages=$TOTAL_PAGES" if [ "$TOTAL_PAGES" -le 50000 ]; then - /bin/retriever ingest ./pdfs/ --embed-model-name nvidia/llama-nemotron-embed-1b-v2 + /bin/retriever ingest ./pdfs/ --hybrid --embed-model-name nvidia/llama-nemotron-embed-1b-v2 else - /bin/retriever pipeline run ./pdfs/ --run-mode inprocess --method pdfium --no-extract-tables --no-extract-charts --no-extract-page-as-image --evaluation-mode none --embed-model-name nvidia/llama-nemotron-embed-1b-v2 --vdb-kwargs-json '{"uri": "lancedb", "table_name": "nemo-retriever"}' --quiet + /bin/retriever pipeline run ./pdfs/ --run-mode inprocess --method pdfium --no-extract-tables --no-extract-charts --no-extract-page-as-image --evaluation-mode none --embed-model-name nvidia/llama-nemotron-embed-1b-v2 --vdb-kwargs-json '{"uri": "lancedb", "table_name": "nemo-retriever", "hybrid": true}' --quiet fi ``` -Both branches write the **same** LanceDB table, `lancedb/nemo-retriever` — the table `retriever query` reads by default. `retriever ingest` defaults to that table automatically; `retriever pipeline run` has no `--table-name` flag and would otherwise default to `nv-ingest`, so the `else` branch pins it with `--vdb-kwargs-json '{"uri": "lancedb", "table_name": "nemo-retriever"}'`. Keep these aligned or queries will read an empty table. 
+`--hybrid` (and `"hybrid": true` in the pipeline branch's `--vdb-kwargs-json`) builds a full-text (BM25) index alongside the vectors, so `retriever query`'s fused lexical leg can match exact terms (IDs, codes, rare phrases); without it, hybrid queries silently fall back to vector-only. Both branches write the **same** LanceDB table, `lancedb/nemo-retriever` — the table `retriever query` reads by default. `retriever ingest` defaults to that table automatically; `retriever pipeline run` has no `--table-name` flag and would otherwise default to `nv-ingest`, so the `else` branch pins it with `--vdb-kwargs-json '{"uri": "lancedb", "table_name": "nemo-retriever", "hybrid": true}'`. Keep these aligned or queries will read an empty table. `retriever ingest` is quiet by default; the `else` (`retriever pipeline run`) branch needs `--quiet` passed explicitly. Quiet mode suppresses progress bars, HuggingFace download logs, vLLM init noise, Ray worker stdout, and INFO-level pipeline status lines on success, while still flushing captured output to stderr on error. Without it the `pipeline run` branch burns thousands of tokens on irrelevant progress output. On success you only see one line: `Ingested N document(s) into LanceDB lancedb/nemo-retriever.` (for `retriever ingest`) or `Pipeline complete: N page(s) → lancedb lancedb/nemo-retriever (T.Ts).` (for `retriever pipeline run`). @@ -24,7 +24,7 @@ After the setup command returns successfully, STOP. Don't run smoke queries to " ## Other input shapes -Same `retriever ingest` command, different `--input-type` and (for non-PDF) install extras. Install extras live in `references/install.md` "Optional extras". +Same `retriever ingest` command, different `--input-type` and (for non-PDF) install extras. Add `--hybrid` to each (as in the PDF recipe) so exact-term search works. Install extras live in `references/install.md` "Optional extras". 
**Images / scanned forms / charts** (`.jpg` `.png` `.tiff` `.bmp`): diff --git a/skills/nemo-retriever/references/troubleshooting.md b/skills/nemo-retriever/references/troubleshooting.md index eaf93188b6..e210e6ac67 100644 --- a/skills/nemo-retriever/references/troubleshooting.md +++ b/skills/nemo-retriever/references/troubleshooting.md @@ -2,7 +2,7 @@ Read this only after you hit one of the named errors below. Don't read it pre-emptively. -## If the index is missing or `retriever query` returns `[]` +## If the index is missing or `retriever query` returns empty `evidence` Means ingest didn't complete (e.g. the text-only pipeline still hit the turn wall, or the table is empty). Tight fallback using the retriever's own pdfium-based extractor (always available — same binary the agent just used for `retriever query`): @@ -18,10 +18,10 @@ For an unlisted subcommand: `/bin/retriever --help` ## Failure modes (expected, not errors) - **First `ingest` takes ~60s+** — vLLM warmup. Expected. -- **First `query` is slow** — embedder (and reranker, with `--rerank`) cold-start. ~10–15s on an idle GPU, but **1–3 minutes under concurrent load**. Expected — wait for it; do not kill or relaunch. It is wrapped in `timeout 2000`, so let it run to that ceiling before treating it as failed. -- **Empty result** — ingest didn't run. Use the fallback above. +- **First `query` is slow** — embedder cold-start. ~10–15s on an idle GPU, but **1–3 minutes under concurrent load**. Expected — wait for it; do not kill or relaunch. It is wrapped in `timeout 2000`, so let it run to that ceiling before treating it as failed. +- **Empty `evidence`** — ingest didn't run (use the fallback above), or the question is genuinely out-of-corpus — read `coverage.thin_spots` to tell which. - **`Clamping num_partitions ...`** — informational on tiny corpora, not an error. -- **Low-relevance top hit on tiny corpus** — look at `_distance` *gaps* between hits, not absolute values. 
+- **Low-relevance top hit on tiny corpus** — even an unrelated query returns *something*; trust the ranking order (the `score` field is informational, not calibrated confidence). - **Page-element-detection warnings during ingest** — non-fatal as long as the embedding step itself succeeds (and they're silenced on a successful run, since `ingest` is quiet by default). ## Unsupported file types (silent filter — the v2 regression mode) diff --git a/skills/nemo-retriever/scripts/doctor.py b/skills/nemo-retriever/scripts/doctor.py new file mode 100644 index 0000000000..9e4b547ba9 --- /dev/null +++ b/skills/nemo-retriever/scripts/doctor.py @@ -0,0 +1,218 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Verify the installed `retriever` engine satisfies the skill's contract. + +Usage: /bin/python skills/retriever/scripts/doctor.py +Exits 0 if all checks pass, 1 otherwise. Runs a LIVE ingest+query probe whenever the `retriever` CLI is on PATH. + +The skill's one primitive is `retriever query --format evidence --hybrid` -> +{evidence, coverage}; this doctor gates on that result shape (the live probe passes +`--format evidence --no-hybrid`). `query`'s DEFAULTS are unchanged (legacy `hits` output, +vector-only) — `evidence`/`hybrid` are opt-in flags the skill passes, so neither default is gated here. +""" +import json +import os +import shutil +import subprocess +import sys +import tempfile + +HERE = os.path.dirname(os.path.abspath(__file__)) +CONTRACT_DIR = os.path.join(os.path.dirname(HERE), "contract") +EMBED_MODEL = "nvidia/llama-nemotron-embed-1b-v2" +# Tiny self-contained probe doc, written to a temp corpus so the live ingest+query +# check needs no external fixture file. 
+PROBE_TEXT = ( + "Contract probe document.\n" + "The capital of the test corpus is Probeville.\n" + "This single short text file exists only so doctor.py can ingest one tiny " + "document and run one query to assert the live hit schema.\n" +) + +results = [] # (ok: bool, label: str, detail: str) + + +def check(ok, label, detail=""): + results.append((bool(ok), label, detail)) + + +def retriever_bin(): + return shutil.which("retriever") + + +def help_text(bin_path, subcmd): + # Force a wide terminal so the rich/click help box does not truncate long + # flag names (e.g. "--embed-model-na…"), which would break substring checks. + env = dict(os.environ, COLUMNS="200") + try: + out = subprocess.run([bin_path, subcmd, "--help"], capture_output=True, text=True, timeout=60, env=env) + return (out.stdout or "") + (out.stderr or "") + except Exception as e: # noqa: BLE001 + return f"__ERROR__ {e}" + + +def main(): + with open(os.path.join(CONTRACT_DIR, "cli-contract.json")) as _f: + contract = json.load(_f) + with open(os.path.join(CONTRACT_DIR, "query-result.schema.json")) as _f: + rr_schema = json.load(_f) + item_schema = rr_schema["$defs"]["evidence_item"] + cov_schema = rr_schema["$defs"]["coverage"] + + bin_path = retriever_bin() + check( + bin_path is not None, "retriever CLI on PATH", "" if bin_path else "run skills/retriever/references/install.md" + ) + if not bin_path: + return report() + + # --- Required subcommands exist (static, no GPU) --- + for sub in contract.get("subcommands_required", []): + try: + rc = subprocess.run([bin_path, sub, "--help"], capture_output=True, text=True, timeout=60).returncode + except Exception: # noqa: BLE001 + rc = 1 + check(rc == 0, f"subcommand `{sub}` exists") + + # --- query flag surface: required flags present (static). `query` is now both the + # skill primitive and the power-user tool, so strategy knobs are allowed. 
--- + rhelp = help_text(bin_path, "query") + for flag in contract["query"]["required_flags"]: + check(flag in rhelp, f"query has {flag}") + + # --- ingest flag surface (static, no GPU) --- + ihelp = help_text(bin_path, "ingest") + for flag in contract["ingest"]["required_flags"]: + check(flag in ihelp, f"ingest has {flag}") + for flag in contract["ingest"]["forbidden_flags"]: + check( + flag not in ihelp, f"ingest does NOT have {flag}", "engine changed: skill assumes single-pass auto-detect" + ) + + # --- Live probe: ingest tiny fixture, retrieve, validate result shape (GPU) --- + tmp = tempfile.mkdtemp(prefix="retriever_doctor_") + try: + corpus = os.path.join(tmp, "corpus") + os.makedirs(corpus) + with open(os.path.join(corpus, "contract_probe.txt"), "w") as probe_f: + probe_f.write(PROBE_TEXT) + uri = os.path.join(tmp, "lancedb") + table = "contract_probe" + ing = subprocess.run( + [ + bin_path, + "ingest", + corpus + "/", + "--table-name", + table, + "--lancedb-uri", + uri, + "--embed-model-name", + EMBED_MODEL, + "--quiet", + ], + capture_output=True, + text=True, + timeout=900, + ) + check(ing.returncode == 0, "live ingest of fixture", ing.stderr.strip()[-300:]) + + r = subprocess.run( + [ + bin_path, + "query", + "What is the capital of the test corpus?", + "--format", + "evidence", + "--top-k", + "3", + "--no-hybrid", + "--table-name", + table, + "--lancedb-uri", + uri, + "--embed-model-name", + EMBED_MODEL, + ], + capture_output=True, + text=True, + timeout=600, + ) + check(r.returncode == 0, "live query --format evidence", r.stderr.strip()[-300:]) + result = None + if r.returncode == 0: + try: + result = json.loads(r.stdout) + check( + isinstance(result, dict) and "evidence" in result and "coverage" in result, + "query emits {evidence, coverage}", + ) + except Exception as e: # noqa: BLE001 + check(False, "query stdout is JSON", str(e)) + if isinstance(result, dict): + ev = result.get("evidence") + check(isinstance(ev, list) and len(ev) > 0, "query 
returned evidence") + if isinstance(ev, list) and ev: + ok, why = validate(ev[0], item_schema) + check(ok, "evidence item matches query-result schema", why) + cov = result.get("coverage") + check(isinstance(cov, dict), "coverage is an object") + if isinstance(cov, dict): + ok, why = validate(cov, cov_schema) + check(ok, "coverage matches query-result schema", why) + finally: + shutil.rmtree(tmp, ignore_errors=True) + + return report() + + +def validate(obj, schema): + """Tiny dependency-free validator: required fields, types (incl. unions), enums.""" + if not isinstance(obj, dict): + return False, "value is not an object" + for req in schema.get("required", []): + if req not in obj: + return False, f"missing required field '{req}'" + types = { + "integer": int, + "string": str, + "number": (int, float), + "object": dict, + "array": list, + "null": type(None), + "boolean": bool, + } + for name, spec in schema.get("properties", {}).items(): + if name not in obj: + continue + if "type" in spec: + allowed = spec["type"] if isinstance(spec["type"], list) else [spec["type"]] + pytypes = [] + for key in allowed: + mapped = types.get(key) + if mapped is None: + continue + pytypes.extend(mapped if isinstance(mapped, tuple) else [mapped]) + if pytypes and not isinstance(obj[name], tuple(pytypes)): + return False, f"field '{name}' should be {spec['type']}, got {type(obj[name]).__name__}" + if "enum" in spec and obj[name] not in spec["enum"]: + return False, f"field '{name}'={obj[name]!r} not in {spec['enum']}" + return True, "" + + +def report(): + failed = [r for r in results if not r[0]] + for ok, label, detail in results: + mark = "PASS" if ok else "FAIL" + line = f"[{mark}] {label}" + if detail and not ok: + line += f" -- {detail}" + print(line) + print(f"\n{len(results) - len(failed)}/{len(results)} checks passed") + return 1 if failed else 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/nemo-retriever/scripts/filename_fast_path.py 
b/skills/nemo-retriever/scripts/filename_fast_path.py deleted file mode 100644 index f11bfd8223..0000000000 --- a/skills/nemo-retriever/scripts/filename_fast_path.py +++ /dev/null @@ -1,161 +0,0 @@ -"""Query-turn filename fast path for the nemo-retriever skill. - -Reads `./pdfs/` from the current working directory. If the query string -literally contains any PDF basename (with or without the `.pdf` extension, -stem ≥6 chars, case-insensitive), runs `retriever pdf stage page-elements` -on each matched file via pdfium, ranks pages by query-token frequency, -and emits a top-10 ranking + the top page's raw text. - -Invoked from SKILL.md as: - /bin/python /scripts/filename_fast_path.py "$QUERY" - -The retriever binary is resolved from sys.executable's directory, so the -script is portable across venvs. - -Stdout protocol (exactly one of): -- `NO_MATCH\n` — no PDF basename in the query. -- `NO_TEXT\n` — matches found but extraction produced no - text on any page (image-only PDFs). -- `\n---TOP_PAGE_TEXT---\n` — JSON with a "ranking" list of - {doc_id, page_number, rank} (1-indexed - pages, up to 10), followed by the top- - ranked page's raw text (first 4000 chars). - -Exit code is 0 in all three success outcomes; non-zero only on hard errors -(missing ./pdfs, page-elements subprocess failure, malformed sidecar JSON). -""" - -from __future__ import annotations - -import json -import os -import re -import subprocess -import sys - -PDF_DIR = "./pdfs" -EXTRACT_OUT = "/tmp/pdf_text" -MIN_STEM_LEN = 6 -TOP_K = 10 -TOP_PAGE_TEXT_CHARS = 4000 - -STOPWORDS = frozenset( - "the a an of in on for to and or is are was were what which how when " - "where who why this that these those with by from as at be it its do " - "does did please could would should tell me you i we us our my".split() -) - - -def find_matches(query_lower: str, basenames: list[str]) -> list[str]: - """Return PDF basenames whose name (with or without .pdf) appears verbatim - in the lowercased query. 
Skip stems shorter than MIN_STEM_LEN.""" - matches = [] - for name in basenames: - stem, ext = os.path.splitext(name) - if ext.lower() != ".pdf" or len(stem) < MIN_STEM_LEN: - continue - if name.lower() in query_lower or stem.lower() in query_lower: - matches.append(name) - return matches - - -def extract_pages(retriever_bin: str, matches: list[str]) -> None: - os.makedirs(EXTRACT_OUT, exist_ok=True) - for m in matches: - subprocess.run( - [ - retriever_bin, - "pdf", - "stage", - "page-elements", - f"{PDF_DIR}/{m}", - "--method", - "pdfium", - "--json-output-dir", - EXTRACT_OUT, - "--compact-json", - ], - check=True, - ) - - -def sidecar_path(pdf_name: str) -> str | None: - stem = os.path.splitext(pdf_name)[0] - candidates = ( - f"{EXTRACT_OUT}/{pdf_name}.pdf_extraction.json", - f"{EXTRACT_OUT}/{stem}.pdf.pdf_extraction.json", - ) - for c in candidates: - if os.path.exists(c): - return c - return None - - -def page_records(sidecar: str) -> list[dict]: - data = json.load(open(sidecar)) - if isinstance(data, list): - return data - if isinstance(data, dict): - return data.get("pages") or data.get("documents") or [] - return [] - - -def page_text(rec: dict) -> str: - txt = rec.get("text") or rec.get("content") or "" - if not txt and isinstance(rec.get("primitives"), list): - txt = " ".join(p.get("text", "") for p in rec["primitives"] if isinstance(p, dict)) - return txt or "" - - -def tokenize(query: str) -> list[str]: - return [t for t in re.split(r"[^a-z0-9]+", query.lower()) if t and t not in STOPWORDS and len(t) > 2] - - -def rank_pages(matches: list[str], toks: list[str]) -> list[tuple[int, int, str, str]]: - """Return list of (score, page_number, doc_stem, text) sorted by - descending score, ascending page number.""" - scored = [] - for m in matches: - sidecar = sidecar_path(m) - if sidecar is None: - continue - stem = os.path.splitext(m)[0] - for rec in page_records(sidecar): - pn = rec.get("page_number") or rec.get("page") or 0 - txt = page_text(rec) - score = 
sum(txt.lower().count(t) for t in toks) - if score > 0: - scored.append((score, pn, stem, txt)) - scored.sort(key=lambda r: (-r[0], r[1])) - return scored - - -def main() -> int: - if len(sys.argv) != 2: - print(f"usage: {sys.argv[0]} ", file=sys.stderr) - return 2 - query = sys.argv[1] - ql = query.lower() - retriever_bin = os.path.join(os.path.dirname(sys.executable), "retriever") - - basenames = sorted(p for p in os.listdir(PDF_DIR) if p.lower().endswith(".pdf")) - matches = find_matches(ql, basenames) - if not matches: - print("NO_MATCH") - return 0 - - extract_pages(retriever_bin, matches) - scored = rank_pages(matches, tokenize(ql)) - if not scored: - print("NO_TEXT") - return 0 - - ranking = [{"doc_id": s[2], "page_number": s[1], "rank": i + 1} for i, s in enumerate(scored[:TOP_K])] - print(json.dumps({"ranking": ranking})) - print("---TOP_PAGE_TEXT---") - print(scored[0][3][:TOP_PAGE_TEXT_CHARS]) - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/skills/nemo-retriever/scripts/grep_corpus.py b/skills/nemo-retriever/scripts/grep_corpus.py deleted file mode 100644 index 1471b6e4c0..0000000000 --- a/skills/nemo-retriever/scripts/grep_corpus.py +++ /dev/null @@ -1,99 +0,0 @@ -"""Case-insensitive keyword/regex search over the corpus via the LanceDB index. - -This script scans the already-built LanceDB table, so it returns matches -across every chunk `retriever ingest` indexed (text, table, chart, image -transcriptions where present) without re-reading any PDF. - -Usage: - /bin/python /scripts/grep_corpus.py \\ - [--max-hits 50] [--lancedb-uri ./lancedb] [--table-name nemo-retriever] - -`pattern` is a Python regex, case-insensitive. For a literal-string search, -just write the string — most identifier characters (`.`, `-`, `_`, digits, -letters) are unambiguous unless you include regex metacharacters -(`(`, `|`, `*`, `?`, `[`, `]`, `\\`, `^`, `$`). - -Output (one line per hit; sorted by pdf_basename then page_number): - :p:: ...... 
- -Prints `NO_MATCH` on zero hits. Caps at `--max-hits` to keep the turn output -bounded; raise it if you really want more. -""" - -from __future__ import annotations - -import argparse -import json -import re -import sys - - -def main() -> int: - ap = argparse.ArgumentParser() - ap.add_argument("pattern", help="Python regex (case-insensitive)") - ap.add_argument("--max-hits", type=int, default=50) - ap.add_argument("--snippet-pad", type=int, default=60) - ap.add_argument("--lancedb-uri", default="./lancedb") - ap.add_argument("--table-name", default="nemo-retriever") - args = ap.parse_args() - - try: - import lancedb - except ImportError: - print("ERROR: lancedb not importable. Run with /bin/python.", file=sys.stderr) - return 1 - - try: - pat = re.compile(args.pattern, re.IGNORECASE) - except re.error as e: - print(f"ERROR: bad regex {args.pattern!r}: {e}", file=sys.stderr) - return 2 - - try: - db = lancedb.connect(args.lancedb_uri) - tbl = db.open_table(args.table_name) - except Exception as e: - print(f"ERROR: can't open lancedb table {args.table_name!r} at " f"{args.lancedb_uri!r}: {e}", file=sys.stderr) - return 1 - - rows = tbl.to_pandas() - if "text" not in rows.columns: - print(f"ERROR: lancedb table has no 'text' column. 
columns={list(rows.columns)}", file=sys.stderr) - return 1 - - hits = [] - for row in rows.itertuples(index=False): - text = getattr(row, "text", "") or "" - m = pat.search(text) - if not m: - continue - pdf = getattr(row, "pdf_basename", "?") - page = getattr(row, "page_number", "?") - meta_raw = getattr(row, "metadata", "") or "" - if isinstance(meta_raw, str): - try: - meta = json.loads(meta_raw) if meta_raw else {} - except json.JSONDecodeError: - meta = {} - elif isinstance(meta_raw, dict): - meta = meta_raw - else: - meta = {} - type_ = meta.get("type", "?") - start = max(0, m.start() - args.snippet_pad) - end = min(len(text), m.end() + args.snippet_pad) - snippet = text[start:end].replace("\n", " ") - hits.append((pdf, page, type_, snippet)) - - hits.sort(key=lambda h: (str(h[0]), int(h[1]) if isinstance(h[1], (int, float)) else 0)) - for pdf, page, type_, snippet in hits[: args.max_hits]: - print(f"{pdf}:p{page}:{type_}: ...{snippet}...") - if not hits: - print("NO_MATCH") - elif len(hits) > args.max_hits: - print(f"... ({len(hits) - args.max_hits} more matches truncated; " f"raise --max-hits to see them)") - return 0 - - -if __name__ == "__main__": - sys.exit(main())