diff --git a/agent_eval/build_report.py b/agent_eval/build_report.py
index 461614d813..83fc866c5f 100644
--- a/agent_eval/build_report.py
+++ b/agent_eval/build_report.py
@@ -10,7 +10,7 @@
 when several runs are passed.
 
 Unlike the runner, this script MAY import ``nemo_retriever`` to reuse
-``score.recall_at_k`` and ``llm.clients.judge.LLMJudge`` (it runs where the
+``score.recall_at_k`` and ``models.llm.clients.judge.LLMJudge`` (it runs where the
 codebase exists). Both imports degrade gracefully if unavailable.
 
 Usage:
@@ -136,6 +136,12 @@ def load_gold(manifest_path: Path) -> dict[str, Gold]:
 _PIPELINE_SEP = re.compile(r"(?:;|&&|\|\||\||\n|\$\(|`)")
 _ENV_ASSIGN = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*=")
 _WRAPPERS = {"sudo", "time", "nice", "nohup", "exec", "env", "command", "builtin"}
+# Leading shell keywords that precede a command inside control flow, e.g.
+# `if [ -n "$RETRIEVER_VENV" ]; then "$RETRIEVER_VENV/bin/retriever" query ...`
+# or `... fi "$RETRIEVER_BIN" query ...`. After splitting, the retriever segment may
+# start with `then`/`else`/`fi`/`done` etc., which must be stripped or the head token
+# reads as the keyword, not the command.
+_SHELL_KW = {"then", "do", "else", "elif", "fi", "done", "{"}
 _TIMEOUT_VAL_FLAGS = {"-k", "--kill-after", "-s", "--signal"}
 _PARSE_ERR = re.compile(r"pdf_basename|JSONDecodeError|Extra data|_default_decoder|KeyError", re.I)
 # The baseline profile installs a PATH shim that prints this and exits 127. A
@@ -159,7 +165,7 @@ def _strip_wrappers(seg: str) -> list[str]:
             if i < len(toks):  # the DURATION token
                 i += 1
             continue
-        if t in _WRAPPERS:
+        if t in _WRAPPERS or t in _SHELL_KW:
             i += 1
             continue
         break
@@ -170,7 +176,10 @@ def _seg_is_retriever(seg: str) -> bool:
     toks = _strip_wrappers(seg.strip())
     if not toks:
         return False
-    h = toks[0]
+    # Strip surrounding quotes so a guarded/quoted binary path like
+    # `"$RETRIEVER_VENV/bin/retriever"` is recognized (the var stays unexpanded,
+    # but the literal still ends in `/retriever`).
+    h = toks[0].strip("'\"")
     if h == "retriever" or h.endswith("/retriever"):
         return True
     if len(toks) >= 3 and toks[0] == "uv" and toks[1] == "run" and toks[2] == "retriever":
@@ -180,8 +189,33 @@ def _seg_is_retriever(seg: str) -> bool:
     return False
 
 
+# Var names assigned from a resolved retriever binary, e.g.
+# `RETRIEVER_BIN="$(command -v retriever)"`, `RETRIEVER_BIN="$RETRIEVER_VENV/bin/retriever"`,
+# or `RETRIEVER=retriever`. The agent builds these to harden against an unset
+# RETRIEVER_VENV, then invokes `"$RETRIEVER_BIN" query ...` — whose head is a $var,
+# not a `retriever` literal, so it slips past _seg_is_retriever.
+_RETR_VAR_ASSIGN = re.compile(r"\b([A-Za-z_]\w*)=[^\n;]*\bretriever\b")
+_VAR_REF = re.compile(r"^\$\{?([A-Za-z_]\w*)\}?$")
+
+
+def _retriever_bin_vars(cmd: str) -> set[str]:
+    return {hit.group(1) for hit in _RETR_VAR_ASSIGN.finditer(cmd or "")}  # names on the left of a matching `VAR=`
+
+
 def cmd_uses_retriever(cmd: str) -> bool:
-    return any(_seg_is_retriever(s) for s in _PIPELINE_SEP.split(cmd or ""))
+    """True when a segment of ``cmd`` runs retriever, directly or through a retriever-assigned $VAR."""
+    segments = _PIPELINE_SEP.split(cmd or "")
+    if any(_seg_is_retriever(seg) for seg in segments):
+        return True
+    bin_vars = _retriever_bin_vars(cmd)
+    if not bin_vars:
+        return False
+    for seg in segments:
+        tokens = _strip_wrappers(seg.strip())
+        ref = _VAR_REF.match(tokens[0].strip("'\"")) if len(tokens) >= 2 else None
+        if ref and ref.group(1) in bin_vars:
+            return True
+    return False
 
 
 def _retriever_piped_to_parser(cmd: str) -> bool:
@@ -191,6 +225,10 @@ def _retriever_piped_to_parser(cmd: str) -> bool:
 
 _CODEX_EXIT_RE = re.compile(r"exited with code (\d+)")
 _HITS_JSON_RE = re.compile(r'"page_number"')
+# Codex backgrounds a slow command (~1s yield) → output says "Process running with
+# session ID <n>"; the agent then polls it via a function_call whose arguments carry
+# {"session_id": <n>}. The clean exit lands on that poll, not the original query call.
+_BG_SESSION_RE = re.compile(r"running with session(?:\s+ID)?\s+(\d+)", re.I)
 
 
 def detect_retriever_usage_codex(agent_log: Path) -> dict[str, bool]:
@@ -207,6 +245,7 @@ def detect_retriever_usage_codex(agent_log: Path) -> dict[str, bool]:
         return {"attempted": False, "clean": False, "engine": False}
     calls: dict[str, str] = {}
     outs: dict[str, str] = {}
+    polls: dict[str, str] = {}  # call_id -> polled session_id (background-continue calls)
     hits_seen = False
     for line in agent_log.read_text().splitlines():
         if not line.strip():
@@ -227,6 +266,9 @@ def detect_retriever_usage_codex(agent_log: Path) -> dict[str, bool]:
             if isinstance(cmd, list):
                 cmd = " ".join(str(x) for x in cmd)
             calls[p.get("call_id")] = str(cmd)
+            sid = a.get("session_id")
+            if sid is not None:
+                polls[p.get("call_id")] = str(sid)
         elif p.get("type") == "function_call_output":
             o = p.get("output")
             if isinstance(o, dict):
@@ -236,6 +278,7 @@ def detect_retriever_usage_codex(agent_log: Path) -> dict[str, bool]:
             if _HITS_JSON_RE.search(o) and ('"source"' in o or '"text"' in o):
                 hits_seen = True
     attempted = clean = engine = False
+    retr_sessions: set[str] = set()  # session IDs opened by a backgrounded retriever query
     for cid, cmd in calls.items():
         if not cmd_uses_retriever(cmd):
             continue
@@ -247,11 +290,30 @@ def detect_retriever_usage_codex(agent_log: Path) -> dict[str, bool]:
         if m and m.group(1) == "0":
             clean = True
             engine = True
-    # Codex backgrounds `retriever query` (1s yield), so its hits often arrive in a
-    # later polled output rather than a clean exit. Credit that to engine — but ONLY
-    # when a real retriever-query command was attempted, so direct LanceDB pandas
-    # reads (which also emit page_number/source/text) aren't miscounted. Guarantees
-    # engine ⊆ attempted.
+        bg = _BG_SESSION_RE.search(out)  # query backgrounded → remember its session
+        if bg:
+            retr_sessions.add(bg.group(1))
+    # A backgrounded `retriever query` finishes in a later session-poll, not the original
+    # call — credit that poll's clean exit to the query. Follow re-yields (a poll may
+    # background again under a new session id) to a fixpoint, bounded by #polls.
+    if retr_sessions and not clean:
+        changed = True
+        while changed and not clean:
+            changed = False
+            for cid, sid in polls.items():
+                if sid not in retr_sessions:
+                    continue
+                out = outs.get(cid, "")
+                m = _CODEX_EXIT_RE.search(out)
+                if m and m.group(1) == "0":
+                    clean = engine = True
+                    break
+                nb = _BG_SESSION_RE.search(out)
+                if nb and nb.group(1) not in retr_sessions:
+                    retr_sessions.add(nb.group(1))
+                    changed = True
+    # Hits-JSON fallback (query returned results even if no clean exit was captured),
+    # gated on a real retriever attempt so direct LanceDB reads aren't miscounted.
     if attempted and hits_seen:
         engine = True
     return {"attempted": attempted, "clean": clean, "engine": engine}
@@ -398,7 +460,7 @@ def build_judge(model: str, api_base: str | None, api_key_env: str):
         print(f"  judge disabled: ${api_key_env} not set", file=sys.stderr)
         return None
     try:
-        from nemo_retriever.llm.clients.judge import LLMJudge  # type: ignore
+        from nemo_retriever.models.llm.clients.judge import LLMJudge  # type: ignore
     except Exception as exc:  # noqa: BLE001
         print(f"  judge disabled: cannot import LLMJudge ({exc})", file=sys.stderr)
         return None
@@ -415,7 +477,13 @@ def _load_judge_cache(run_dir: Path) -> dict[str, tuple]:
         r = json.loads(rp.read_text())
     except Exception:
         return {}
-    return {q["query_id"]: (q.get("judge_score"), q.get("judge_error", "")) for q in r.get("per_query", [])}
+    # Only reuse SUCCESSFUL judgements; a prior run that scored None (e.g. judge
+    # import broken / disabled) must not poison re-judging into skipping forever.
+    return {
+        q["query_id"]: (q.get("judge_score"), q.get("judge_error", ""))
+        for q in r.get("per_query", [])
+        if q.get("judge_score") is not None
+    }
 
 
 def apply_judge(
diff --git a/agent_eval/profiles.py b/agent_eval/profiles.py
index e68d52d52c..311796b84c 100644
--- a/agent_eval/profiles.py
+++ b/agent_eval/profiles.py
@@ -186,8 +186,13 @@ def build_query_workdir(*, base_dir: Path, query_dir: Path, profile: str, agent:
     """Create a per-query workdir that symlinks the shared base contents."""
     wd = query_dir / "workdir"
     wd.mkdir(parents=True, exist_ok=True)
+    # AGENT_EVAL_NO_PDFS=1 withholds the raw ./pdfs from the agent so it cannot
+    # fall back to reading source files (pdftotext/pypdf) — forces retriever use.
+    no_pdfs = os.environ.get("AGENT_EVAL_NO_PDFS") == "1"
     # Shared, read-only-ish artifacts: symlink to the base.
     for name in ("pdfs", "lancedb"):
+        if name == "pdfs" and no_pdfs:
+            continue
         src = base_dir / name
         dst = wd / name
         if src.exists() and not dst.exists():
diff --git a/nemo_retriever/src/nemo_retriever/cli/evidence.py b/nemo_retriever/src/nemo_retriever/cli/evidence.py
new file mode 100644
index 0000000000..505827becc
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/cli/evidence.py
@@ -0,0 +1,105 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-26, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Answer-ready ``{evidence, coverage}`` shaping for ``retriever query --format evidence``.
+
+The skill reasons over this shape: each evidence item is fidelity-tagged and
+citation-ready, and ``coverage`` summarizes what was searched and flags thin spots.
+``--format evidence`` is opt-in; ``query``'s default output stays the flat hit list.
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any
+
+from nemo_retriever.common.vdb.records import _derive_fidelity
+
+_KNOWN_MODALITIES = {"text", "table", "chart", "image", "audio", "video_frame"}
+
+
+def _normalize_modality(value: Any) -> str:
+    """Collapse a raw content-type label onto one of the known modality names."""
+    label = str(value or "text").lower()
+    if label in _KNOWN_MODALITIES:
+        return label
+    # The prefixes are mutually exclusive; anything unrecognized is treated as text.
+    prefix_map = (
+        ("table", "table"),
+        ("chart", "chart"),
+        ("image", "image"),
+        ("infographic", "image"),
+        ("video", "video_frame"),
+        ("audio", "audio"),
+    )
+    return next((name for prefix, name in prefix_map if label.startswith(prefix)), "text")
+
+
+def _evidence_item(hit: dict[str, Any]) -> dict[str, Any]:
+    """Convert one raw hit into a cited, fidelity-tagged evidence item.
+
+    The locator is the first available of page number, segment start, frame timestamp
+    and bbox; ``score`` is ``_score``, else ``_distance``, else 0.0.
+    """
+    raw_meta = hit.get("metadata")
+    meta = raw_meta if isinstance(raw_meta, dict) else {}
+    source = os.path.basename(str(hit.get("pdf_basename") or hit.get("source") or ""))
+    if source.lower().endswith(".pdf"):
+        source = source[: -len(".pdf")]
+    raw_modality = hit.get("content_type") or meta.get("type") or "text"
+
+    page = hit.get("page_number")
+    segment = meta.get("segment_start_seconds")
+    frame = meta.get("frame_timestamp_seconds")
+    bbox = meta.get("bbox_xyxy_norm")
+    if page is not None:
+        kind, value, citation = "page", page, f"{source} p.{page}"
+    elif segment is not None:
+        kind, value, citation = "segment", segment, f"{source} @{segment}"
+    elif frame is not None:
+        kind, value, citation = "timestamp", frame, f"{source} @{frame}"
+    elif bbox is not None:
+        kind, value, citation = "bbox", bbox, source
+    else:
+        kind, value, citation = "page", None, source
+
+    # A stored fidelity wins; otherwise derive one, falling back to "verbatim".
+    fidelity = meta.get("fidelity") or _derive_fidelity(raw_modality, meta, meta) or "verbatim"
+    score = hit.get("_score")
+    if score is None:
+        score = hit.get("_distance")
+    if score is None:
+        score = 0.0
+    return {
+        "text": hit.get("text", ""),
+        "source": source,
+        "locator": {"kind": kind, "value": value},
+        "modality": _normalize_modality(raw_modality),
+        "fidelity": fidelity,
+        "score": score,
+        "citation": citation,
+    }
+
+
+def build_evidence_result(hits: list, strategies_used: list[str]) -> dict[str, Any]:
+    """Wrap raw hits in the ``{evidence, coverage}`` shape emitted by ``--format evidence``.
+
+    Every hit is converted with ``_evidence_item``. ``coverage`` echoes ``strategies_used``,
+    reports the number of distinct non-empty sources as ``n_docs_seen``, and lists
+    ``thin_spots``: no matches at all, a single source, or only ``vlm_caption`` evidence.
+    """
+    items = list(map(_evidence_item, hits or []))
+    docs = {item["source"] for item in items if item.get("source")}
+    # Thin spots: with no evidence at all, only the out-of-corpus note applies.
+    if items:
+        checks = (
+            (len(docs) == 1, "single source"),
+            (all(item["fidelity"] == "vlm_caption" for item in items), "only low-fidelity (chart/image) evidence"),
+        )
+        thin_spots = [label for flagged, label in checks if flagged]
+    else:
+        thin_spots = ["no matches — likely out of corpus"]
+    return {
+        "evidence": items,
+        "coverage": {"strategies_used": strategies_used, "n_docs_seen": len(docs), "thin_spots": thin_spots},
+    }
diff --git a/nemo_retriever/src/nemo_retriever/cli/main.py b/nemo_retriever/src/nemo_retriever/cli/main.py
index 1a9db0ed7c..860bfee61f 100644
--- a/nemo_retriever/src/nemo_retriever/cli/main.py
+++ b/nemo_retriever/src/nemo_retriever/cli/main.py
@@ -42,6 +42,7 @@
 from nemo_retriever.cli.ingest_workflow import (
     run_ingest_workflow,
 )
+from nemo_retriever.cli.evidence import build_evidence_result
 from nemo_retriever.cli.query_workflow import query_documents
 from nemo_retriever.query.options import (
     QueryEmbedOptions,
@@ -92,11 +93,27 @@
 _ROOT_CLI_ERRORS = (OSError, RuntimeError, ValueError, ValidationError)
 
 
-def _query_cli_hit(hit: RetrievalHit) -> dict[str, object]:
+def _query_cli_hit(hit: RetrievalHit, max_text_chars: int | None = None) -> dict[str, object]:
+    """Shape one retrieval hit for the ``--format hits`` JSON output.
+
+    ``max_text_chars``: None keeps the full text, 0 drops it (metadata-only), N > 0 keeps
+    the first N chars plus "…". Non-dict metadata and missing/None text are tolerated.
+    """
+    metadata = hit.get("metadata") if isinstance(hit.get("metadata"), dict) else {}
+    modality = hit.get("content_type") or metadata.get("type") or "text"
+    # Rank signal: _score (hybrid/rerank) if set, else _distance, else null. ORDER is authoritative.
+    score: object = hit.get("_score")
+    if score is None:
+        score = hit.get("_distance")
+    text = hit.get("text", "")
+    if text and max_text_chars is not None and 0 <= max_text_chars < len(text):
+        text = text[:max_text_chars] + ("…" if max_text_chars > 0 else "")
     return {
         "source": hit.get("source", ""),
         "page_number": hit.get("page_number"),
-        "text": hit.get("text", ""),
+        "text": text,
+        "modality": modality,
+        "score": score,
     }
 
 
@@ -356,6 +373,14 @@ def ingest_command(
             "table without duplicate checks; rerunning the same inputs in append mode creates duplicates."
         ),
     ),
+    hybrid: bool = typer.Option(
+        False,
+        "--hybrid/--no-hybrid",
+        help=(
+            "Also build a full-text (BM25) index over the ingested text, so `query --hybrid` can "
+            "fuse lexical + vector retrieval. Opt-in (default off) — vector-only otherwise."
+        ),
+    ),
     ray_address: str | None = typer.Option(None, "--ray-address", help="Ray address for batch run mode."),
     ray_log_to_driver: bool | None = typer.Option(
         None,
@@ -696,6 +721,7 @@ def ingest_command(
                         lancedb_uri=lancedb_uri,
                         table_name=table_name,
                         overwrite=overwrite,
+                        hybrid=hybrid,
                     ),
                 )
             )
@@ -780,45 +806,92 @@ def query_command(
             "any of --reranker-invoke-url / --reranker-model-name / --reranker-backend is set."
         ),
     ),
+    hybrid: bool = typer.Option(
+        False,
+        "--hybrid/--no-hybrid",
+        help=(
+            "Fused vector + full-text (BM25) retrieval; falls back to vector-only if the table "
+            "has no FTS index. Opt-in (default off) — preserves the legacy vector-only default."
+        ),
+    ),
+    output_format: str = typer.Option(
+        "hits",
+        "--format",
+        help=(
+            "'hits' (default): raw ranked hit list (source/page/text/modality/score) — the legacy "
+            "output. 'evidence': answer-ready, fidelity-tagged, cited evidence + coverage (opt-in)."
+        ),
+    ),
+    max_text_chars: int | None = typer.Option(
+        None,
+        "--max-text-chars",
+        help="('hits' format only) Truncate each hit's text to N chars (0 = metadata-only). Default: full text.",
+    ),
 ) -> None:
+    if output_format not in ("hits", "evidence"):
+        typer.echo(f"Error: unknown --format {output_format!r} (use 'hits' or 'evidence').", err=True)
+        raise typer.Exit(1)
+    if max_text_chars is not None and output_format != "hits":
+        typer.echo("Error: --max-text-chars only applies to --format hits.", err=True)
+        raise typer.Exit(1)
     if reranker_invoke_url is None:
         reranker_invoke_url = os.environ.get("RERANKER_INVOKE_URL") or None
     if embed_invoke_url is None:
         embed_invoke_url = os.environ.get("EMBED_INVOKE_URL") or None
     rerank = rerank or bool(reranker_invoke_url) or bool(reranker_model_name) or bool(reranker_backend)
     _silence_noisy_libraries()
+
+    def _run(use_hybrid: bool) -> list:
+        return query_documents(
+            QueryRequest(
+                query=query,
+                retrieval=QueryRetrievalOptions(
+                    top_k=top_k,
+                    candidate_k=candidate_k,
+                    page_dedup=page_dedup,
+                    content_types=content_types,
+                    hybrid=use_hybrid,
+                ),
+                embed=QueryEmbedOptions(
+                    embed_invoke_url=embed_invoke_url,
+                    embed_model_name=embed_model_name,
+                ),
+                rerank=QueryRerankOptions(
+                    enabled=rerank,
+                    reranker_invoke_url=reranker_invoke_url,
+                    reranker_model_name=reranker_model_name,
+                    reranker_backend=reranker_backend,
+                ),
+                storage=QueryStorageOptions(
+                    lancedb_uri=lancedb_uri,
+                    table_name=table_name,
+                ),
+            )
+        )
+
     try:
         with _quiet_capture():
-            hits = query_documents(
-                QueryRequest(
-                    query=query,
-                    retrieval=QueryRetrievalOptions(
-                        top_k=top_k,
-                        candidate_k=candidate_k,
-                        page_dedup=page_dedup,
-                        content_types=content_types,
-                    ),
-                    embed=QueryEmbedOptions(
-                        embed_invoke_url=embed_invoke_url,
-                        embed_model_name=embed_model_name,
-                    ),
-                    rerank=QueryRerankOptions(
-                        enabled=rerank,
-                        reranker_invoke_url=reranker_invoke_url,
-                        reranker_model_name=reranker_model_name,
-                        reranker_backend=reranker_backend,
-                    ),
-                    storage=QueryStorageOptions(
-                        lancedb_uri=lancedb_uri,
-                        table_name=table_name,
-                    ),
-                )
-            )
+            if hybrid:
+                try:
+                    hits = _run(True)
+                    strategies = ["semantic", "lexical"]
+                except Exception:  # noqa: BLE001 — e.g. table has no FTS index; degrade to vector-only
+                    hits = _run(False)
+                    strategies = ["semantic"]
+            else:
+                hits = _run(False)
+                strategies = ["semantic"]
     except _ROOT_CLI_ERRORS as exc:
         typer.echo(f"Error: {exc}", err=True)
         raise typer.Exit(1) from exc
 
-    typer.echo(json.dumps([_query_cli_hit(hit) for hit in hits], indent=2, sort_keys=True, default=str))
+    if output_format == "evidence":
+        result = build_evidence_result(hits, strategies)
+        typer.echo(json.dumps(result, indent=2, sort_keys=True, default=str))
+    else:
+        typer.echo(
+            json.dumps([_query_cli_hit(hit, max_text_chars) for hit in hits], indent=2, sort_keys=True, default=str)
+        )
 
 
 @app.callback()
diff --git a/nemo_retriever/src/nemo_retriever/common/vdb/records.py b/nemo_retriever/src/nemo_retriever/common/vdb/records.py
index bcf8d0091a..2b498036af 100644
--- a/nemo_retriever/src/nemo_retriever/common/vdb/records.py
+++ b/nemo_retriever/src/nemo_retriever/common/vdb/records.py
@@ -66,6 +66,25 @@ def _dict_or_empty(value: Any) -> dict[str, Any]:
     return dict(value) if isinstance(value, dict) else {}
 
 
+def _derive_fidelity(content_type: Any, metadata: dict[str, Any], content_metadata: dict[str, Any]) -> str | None:
+    """Map a chunk's modality + real provenance signals to a trust tier.
+
+    verbatim (PDF text layer) > ocr (text needing OCR, page images, tables/charts/
+    infographics) > transcribed (audio/video) > vlm_caption (any other image).
+    Returns None for unknown types so the field is omitted rather than guessed.
+    """
+    t = str(content_type or "").lower()
+    if t in ("audio", "video", "video_frame"):
+        return "transcribed"
+    if t == "image":
+        return "ocr" if content_metadata.get("subtype") == "page_image" else "vlm_caption"
+    if t.startswith(("table", "chart", "infographic")):
+        return "ocr"
+    if t == "text":
+        return "ocr" if metadata.get("needs_ocr_for_text") is True else "verbatim"
+    return None
+
+
 def _client_record_from_graph_row(row: dict[str, Any]) -> dict[str, Any] | None:
     metadata = _dict_or_empty(row.get("metadata"))
 
@@ -84,6 +103,9 @@ def _client_record_from_graph_row(row: dict[str, Any]) -> dict[str, Any] | None:
     content_type = row.get("_content_type") or row.get("content_type")
     if content_type:
         content_metadata.setdefault("type", content_type)
+    fidelity = _derive_fidelity(content_type, metadata, content_metadata)
+    if fidelity:
+        content_metadata.setdefault("fidelity", fidelity)
     stored_image_uri = row.get("_stored_image_uri") or row.get("stored_image_uri")
     if stored_image_uri:
         content_metadata.setdefault("stored_image_uri", stored_image_uri)
diff --git a/nemo_retriever/src/nemo_retriever/ingest/plan.py b/nemo_retriever/src/nemo_retriever/ingest/plan.py
index cc6d2ceeeb..e2914b5d63 100644
--- a/nemo_retriever/src/nemo_retriever/ingest/plan.py
+++ b/nemo_retriever/src/nemo_retriever/ingest/plan.py
@@ -201,6 +201,8 @@ class IngestStorageOptions:
     lancedb_uri: str = "lancedb"
     table_name: str = "nemo-retriever"
     overwrite: bool = True
+    # Also build the LanceDB FTS/BM25 index so `query --hybrid` can fuse lexical + vector.
+    hybrid: bool = False
 
 
 @dataclass(frozen=True)
@@ -625,13 +627,18 @@ def resolve_ingest_plan(request: IngestPlanRequest) -> ResolvedIngestPlan:
     )
     extract_params = ExtractParams(**extract_kwargs)
     embed_params = EmbedParams(**embed_kwargs) if embed_kwargs else None
-    vdb_params = VdbUploadParams(
-        vdb_kwargs={
-            "uri": storage.lancedb_uri,
-            "table_name": storage.table_name,
-            "overwrite": bool(storage.overwrite),
-        }
-    )
+    vdb_upload_kwargs = {
+        "uri": storage.lancedb_uri,
+        "table_name": storage.table_name,
+        "overwrite": bool(storage.overwrite),
+    }
+    # `hybrid` is a vdb table-build knob, like `overwrite`/`uri`/`table_name` above: it rides
+    # on storage options and is forwarded as a LanceDB-backend ctor kwarg, where hybrid=True
+    # makes ingest also build the FTS/BM25 index that `query --hybrid` searches. Injected only
+    # when opted in, so vector-only ingests keep the exact legacy vdb_kwargs.
+    if storage.hybrid:
+        vdb_upload_kwargs["hybrid"] = True
+    vdb_params = VdbUploadParams(vdb_kwargs=vdb_upload_kwargs)
     caption_params = _build_caption_params(request.caption)
     dedup_params = _build_dedup_params(request.dedup)
     store_params = _build_store_params(request.image_store)
diff --git a/nemo_retriever/src/nemo_retriever/models/llm/clients/judge.py b/nemo_retriever/src/nemo_retriever/models/llm/clients/judge.py
index 85364f9cc7..a8f63f9762 100644
--- a/nemo_retriever/src/nemo_retriever/models/llm/clients/judge.py
+++ b/nemo_retriever/src/nemo_retriever/models/llm/clients/judge.py
@@ -235,7 +235,9 @@ class LLMJudge:
     """
 
     _DEFAULT_MODEL: str = "nvidia_nim/nvidia/llama-3.3-nemotron-super-49b-v1.5"
-    _DEFAULT_SAMPLING: LLMInferenceParams = LLMInferenceParams(temperature=0.1, max_tokens=4096)
+    # max_tokens must accommodate the Nemotron reasoning block + the final
+    # {"rating": X}; NVIDIA's llm-judge recipe uses 32768. 4096 truncated mid-think.
+    _DEFAULT_SAMPLING: LLMInferenceParams = LLMInferenceParams(temperature=0.1, max_tokens=32768)
 
     def __init__(
         self,
@@ -296,6 +298,11 @@ def _rate(self, prefix: str, query: str, user_answer: str, reference_answer: str
         ``_get_judge_rating``: retry on an invalid rating or a transport error
         up to ``num_retries`` attempts, then give up with NaN.
         """
+        # The prompt already forbids explanation and demands `{"rating": X}`, and
+        # _parse_rating strips any <think> block — i.e. the judge is designed for the
+        # Nemotron reasoning model. The only requirement is a large enough max_tokens
+        # for the reasoning block to finish and still emit the rating (NVIDIA's
+        # llm-judge recipe uses 32768); 4096 truncated mid-think -> null content.
         messages = [{"role": "user", "content": _render_prompt(prefix, query, user_answer, reference_answer)}]
         attempts = max(1, self.transport.num_retries)
         last_exc: Optional[Exception] = None
diff --git a/nemo_retriever/src/nemo_retriever/query/options.py b/nemo_retriever/src/nemo_retriever/query/options.py
index 331ac82379..74427a6aed 100644
--- a/nemo_retriever/src/nemo_retriever/query/options.py
+++ b/nemo_retriever/src/nemo_retriever/query/options.py
@@ -14,6 +14,9 @@ class QueryRetrievalOptions:
     candidate_k: int | None = None
     page_dedup: bool = False
     content_types: str | Sequence[str] | None = None
+    # Fused vector + full-text (BM25) retrieval. Opt-in (default off) preserves the
+    # legacy vector-only path; requires the LanceDB table to carry an FTS index.
+    hybrid: bool = False
 
 
 @dataclass(frozen=True)
diff --git a/nemo_retriever/src/nemo_retriever/query/workflow.py b/nemo_retriever/src/nemo_retriever/query/workflow.py
index 3a92ed1ea4..8133d11f6d 100644
--- a/nemo_retriever/src/nemo_retriever/query/workflow.py
+++ b/nemo_retriever/src/nemo_retriever/query/workflow.py
@@ -35,12 +35,16 @@ def _build_rerank_kwargs(options: QueryRerankOptions) -> dict[str, str]:
 
 def _build_retriever_kwargs(request: QueryRequest) -> dict[str, Any]:
     embed_kwargs = build_embed_option_kwargs(request.embed.embed_invoke_url, request.embed.embed_model_name)
+    vdb_kwargs: dict[str, Any] = {
+        "uri": request.storage.lancedb_uri,
+        "table_name": request.storage.table_name,
+    }
+    # Only inject hybrid when opted in, so the vector-only path stays byte-for-byte legacy.
+    if request.retrieval.hybrid:
+        vdb_kwargs["hybrid"] = True
     retriever_kwargs: dict[str, Any] = {
         "top_k": request.retrieval.top_k,
-        "vdb_kwargs": {
-            "uri": request.storage.lancedb_uri,
-            "table_name": request.storage.table_name,
-        },
+        "vdb_kwargs": vdb_kwargs,
     }
     if embed_kwargs:
         retriever_kwargs["embed_kwargs"] = embed_kwargs
diff --git a/nemo_retriever/tests/test_fidelity.py b/nemo_retriever/tests/test_fidelity.py
new file mode 100644
index 0000000000..1d8e395d38
--- /dev/null
+++ b/nemo_retriever/tests/test_fidelity.py
@@ -0,0 +1,46 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-26, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from nemo_retriever.common.vdb.records import _client_record_from_graph_row, _derive_fidelity
+
+
+def _fidelity_of(row: dict) -> object:
+    rec = _client_record_from_graph_row(row)
+    assert rec is not None  # the builder is typed to allow None; these fixtures must yield a record
+    return rec["metadata"]["content_metadata"].get("fidelity")  # None when no tier was stamped
+
+
+def _row(content_type, *, needs_ocr=None, subtype=None) -> dict:
+    """Build a minimal graph row; the optional keys appear only when a value is given."""
+    content_meta: dict = {"page_number": 1}
+    if subtype is not None:
+        content_meta["subtype"] = subtype
+    # Splice the OCR flag in between the fixed keys so the key order stays the same.
+    ocr_flag: dict = {} if needs_ocr is None else {"needs_ocr_for_text": needs_ocr}
+    metadata: dict = {"embedding": [0.1, 0.2], **ocr_flag, "content_metadata": content_meta}
+    return {"text": "x", "metadata": metadata, "_content_type": content_type}
+
+
+def test_derive_fidelity_pure_mapping() -> None:
+    assert _derive_fidelity("text", {}, {}) == "verbatim"
+    assert _derive_fidelity("text", {"needs_ocr_for_text": True}, {}) == "ocr"  # flag is read from `metadata`
+    assert _derive_fidelity("image", {}, {}) == "vlm_caption"
+    assert _derive_fidelity("image", {}, {"subtype": "page_image"}) == "ocr"  # subtype is read from `content_metadata`
+    assert _derive_fidelity("table", {}, {}) == "ocr"
+    assert _derive_fidelity("chart_caption", {}, {}) == "ocr"  # matched by the "chart" prefix
+    assert _derive_fidelity("audio", {}, {}) == "transcribed"
+    assert _derive_fidelity("video", {}, {}) == "transcribed"
+    assert _derive_fidelity("", {}, {}) is None  # empty and unknown types yield no tier
+    assert _derive_fidelity("mystery", {}, {}) is None
+
+
+def test_fidelity_stamped_into_stored_record() -> None:  # same tiers, read back through _client_record_from_graph_row
+    assert _fidelity_of(_row("text")) == "verbatim"
+    assert _fidelity_of(_row("text", needs_ocr=True)) == "ocr"
+    assert _fidelity_of(_row("image")) == "vlm_caption"
+    assert _fidelity_of(_row("image", subtype="page_image")) == "ocr"
+    assert _fidelity_of(_row("table")) == "ocr"
+    assert _fidelity_of(_row("audio")) == "transcribed"
diff --git a/nemo_retriever/tests/test_lancedb_retrieval_where.py b/nemo_retriever/tests/test_lancedb_retrieval_where.py
index 15eca336c2..0d614aef2c 100644
--- a/nemo_retriever/tests/test_lancedb_retrieval_where.py
+++ b/nemo_retriever/tests/test_lancedb_retrieval_where.py
@@ -49,7 +49,7 @@ def _tiny_table(uri: str, *, create_fts_index: bool = False) -> None:
 def test_retrieval_where_filters_rows() -> None:
     d = tempfile.mkdtemp()
     _tiny_table(d)
-    op = LanceDB(uri=d, table_name="t", overwrite=False, vector_dim=2, validate_vector_length=False)
+    op = LanceDB(uri=d, table_name="t", overwrite=False, vector_dim=2, validate_vector_length=False, hybrid=False)
     qv = [1.0, 0.0]
     unfiltered = op.retrieval([qv], top_k=10, table_path=d, table_name="t")
     assert len(unfiltered[0]) == 2
@@ -61,7 +61,7 @@ def test_retrieval_where_filters_rows() -> None:
 def test_retrieval_filter_alias() -> None:
     d = tempfile.mkdtemp()
     _tiny_table(d)
-    op = LanceDB(uri=d, table_name="t", overwrite=False, vector_dim=2, validate_vector_length=False)
+    op = LanceDB(uri=d, table_name="t", overwrite=False, vector_dim=2, validate_vector_length=False, hybrid=False)
     qv = [1.0, 0.0]
     filtered = op.retrieval([qv], top_k=10, table_path=d, table_name="t", _filter="text = 'beta'")
     assert len(filtered[0]) == 1
@@ -71,7 +71,7 @@ def test_retrieval_filter_alias() -> None:
 def test_retrieval_where_precedence_over_filter() -> None:
     d = tempfile.mkdtemp()
     _tiny_table(d)
-    op = LanceDB(uri=d, table_name="t", overwrite=False, vector_dim=2, validate_vector_length=False)
+    op = LanceDB(uri=d, table_name="t", overwrite=False, vector_dim=2, validate_vector_length=False, hybrid=False)
     qv = [1.0, 0.0]
     filtered = op.retrieval(
         [qv],
@@ -88,7 +88,7 @@ def test_retrieval_where_precedence_over_filter() -> None:
 def test_retrieval_metadata_like_predicate() -> None:
     d = tempfile.mkdtemp()
     _tiny_table(d)
-    op = LanceDB(uri=d, table_name="t", overwrite=False, vector_dim=2, validate_vector_length=False)
+    op = LanceDB(uri=d, table_name="t", overwrite=False, vector_dim=2, validate_vector_length=False, hybrid=False)
     qv = [1.0, 0.0]
     pred = '%"doc_id": "x"%'
     filtered = op.retrieval([qv], top_k=10, table_path=d, table_name="t", where=f"metadata LIKE '{pred}'")
diff --git a/nemo_retriever/tests/test_llm_params.py b/nemo_retriever/tests/test_llm_params.py
index ea4ddc8e82..ce706c5e9d 100644
--- a/nemo_retriever/tests/test_llm_params.py
+++ b/nemo_retriever/tests/test_llm_params.py
@@ -234,7 +234,7 @@ def test_structured_construction_uses_defaults(self):
         judge = LLMJudge(transport=transport)
         assert judge.model == "nvidia_nim/nvidia/llama-3.3-nemotron-super-49b-v1.5"
         assert judge.sampling.temperature == 0.1
-        assert judge.sampling.max_tokens == 4096
+        assert judge.sampling.max_tokens == 32768
 
     def test_custom_sampling_override(self):
         from nemo_retriever.models.llm.clients import LLMJudge
@@ -263,7 +263,7 @@ def test_from_kwargs_matches_structured(self):
         assert judge.transport.extra_params == {"user": "t"}
         # Sampling stays at judge defaults even when using flat constructor.
         assert judge.sampling.temperature == 0.1
-        assert judge.sampling.max_tokens == 4096
+        assert judge.sampling.max_tokens == 32768
 
     def test_from_kwargs_accepts_sampling_overrides(self):
         from nemo_retriever.models.llm.clients import LLMJudge
@@ -362,7 +362,7 @@ def test_judging_operator_constructs_cleanly(self):
         op = JudgingOperator(model="nvidia_nim/nvidia/llama-3.3-nemotron-super-49b-v1.5")
         assert op._judge.model == "nvidia_nim/nvidia/llama-3.3-nemotron-super-49b-v1.5"
         assert op._judge.sampling.temperature == 0.1
-        assert op._judge.sampling.max_tokens == 4096
+        assert op._judge.sampling.max_tokens == 32768
 
     def test_judging_operator_plumbs_num_retries_to_inner_judge(self):
         """JudgingOperator(num_retries=...) must flow down to the LLMJudge it
diff --git a/nemo_retriever/tests/test_root_cli_workflow.py b/nemo_retriever/tests/test_root_cli_workflow.py
index 6e92fa1a09..c7b7cc9b9a 100644
--- a/nemo_retriever/tests/test_root_cli_workflow.py
+++ b/nemo_retriever/tests/test_root_cli_workflow.py
@@ -1160,3 +1160,25 @@ def fake_quiet_capture() -> Any:
     assert silenced == [True]
     assert captured_use == [True]
     assert "Ingested 1 file(s) → 3 row(s) in LanceDB lancedb/nemo-retriever." in result.output
+
+
+def test_root_ingest_passes_hybrid_into_vdb_kwargs(monkeypatch, tmp_path) -> None:
+    fake_ingestor = _make_fake_ingestor()
+    doc = tmp_path / "a.pdf"
+    doc.write_bytes(b"%PDF-1.4\n")
+
+    monkeypatch.setattr(ingest_execution, "create_ingestor", lambda **_: fake_ingestor)
+    monkeypatch.setattr(ingest_execution, "_count_lancedb_rows", lambda *_, **__: 1)
+
+    result = RUNNER.invoke(
+        cli_main.app,
+        ["ingest", str(doc), "--lancedb-uri", "/tmp/lancedb", "--table-name", "docs", "--hybrid"],
+    )
+
+    assert result.exit_code == 0
+    assert fake_ingestor.vdb_upload.call_args.args[0].vdb_kwargs == {
+        "uri": "/tmp/lancedb",
+        "table_name": "docs",
+        "overwrite": True,
+        "hybrid": True,
+    }
diff --git a/nemo_retriever/tests/test_root_query_cli.py b/nemo_retriever/tests/test_root_query_cli.py
index 45715b2859..ba8ca528a9 100644
--- a/nemo_retriever/tests/test_root_query_cli.py
+++ b/nemo_retriever/tests/test_root_query_cli.py
@@ -37,8 +37,8 @@ def test_root_query_passes_query_options_and_prints_json(monkeypatch) -> None:
         },
     ]
     expected_output = [
-        {"source": "doc.pdf", "page_number": 1, "text": "passage"},
-        {"source": "other.pdf", "page_number": 2, "text": "other"},
+        {"source": "doc.pdf", "page_number": 1, "text": "passage", "modality": "text", "score": 0.2},
+        {"source": "other.pdf", "page_number": 2, "text": "other", "modality": "table", "score": 0.4},
     ]
 
     class FakeRetriever:
@@ -108,7 +108,7 @@ def query(self, query: str, **kwargs: Any) -> list[dict[str, Any]]:
     assert result.exit_code == 0
     assert query_kwargs == [{"candidate_k": 3, "page_dedup": True, "content_types": "text,table"}]
     assert json.loads(result.output) == [
-        {"page_number": 1, "source": "doc.pdf", "text": "text row"},
+        {"page_number": 1, "source": "doc.pdf", "text": "text row", "modality": "text", "score": None},
     ]
 
 
@@ -277,3 +277,52 @@ def fail_query_documents(*_args: Any, **_kwargs: Any) -> list[dict[str, Any]]:
 
     assert result.exit_code == 1
     assert "Error: database unavailable" in result.output
+
+
+def test_root_query_passes_hybrid_into_vdb_kwargs(monkeypatch) -> None:
+    retriever_calls: list[dict[str, Any]] = []
+
+    class FakeRetriever:
+        def __init__(self, **kwargs: Any) -> None:
+            retriever_calls.append(kwargs)
+
+        def query(self, query: str, **_kwargs: Any) -> list[dict[str, Any]]:
+            return []
+
+    monkeypatch.setattr(query_core, "Retriever", FakeRetriever)
+
+    result = RUNNER.invoke(
+        cli_main.app,
+        ["query", "q", "--top-k", "5", "--lancedb-uri", "/tmp/lancedb", "--table-name", "docs", "--hybrid"],
+    )
+
+    assert result.exit_code == 0
+    assert retriever_calls == [
+        {"top_k": 5, "vdb_kwargs": {"uri": "/tmp/lancedb", "table_name": "docs", "hybrid": True}}
+    ]
+
+
+def test_root_query_max_text_chars_truncates_and_omits(monkeypatch) -> None:
+    hits = [{"text": "abcdefghij", "source": "d.pdf", "page_number": 1, "metadata": {"type": "text"}, "_distance": 0.1}]
+
+    class FakeRetriever:
+        def __init__(self, **_: Any) -> None:
+            pass
+
+        def query(self, query: str, **_kwargs: Any) -> list[dict[str, Any]]:
+            return hits
+
+    monkeypatch.setattr(query_core, "Retriever", FakeRetriever)
+
+    snip = RUNNER.invoke(cli_main.app, ["query", "q", "--max-text-chars", "5"])
+    assert snip.exit_code == 0
+    snip_hit = json.loads(snip.output)[0]
+    assert snip_hit["text"] == "abcde…"
+    assert snip_hit["modality"] == "text"  # non-text fields intact
+    assert snip_hit["source"] == "d.pdf"
+
+    meta = RUNNER.invoke(cli_main.app, ["query", "q", "--max-text-chars", "0"])
+    meta_hit = json.loads(meta.output)[0]
+    assert meta_hit["text"] == ""
+    assert meta_hit["source"] == "d.pdf"
+    assert meta_hit["page_number"] == 1
diff --git a/skills/nemo-retriever/SKILL.md b/skills/nemo-retriever/SKILL.md
index 1292e65c4d..e3ef29c6fd 100644
--- a/skills/nemo-retriever/SKILL.md
+++ b/skills/nemo-retriever/SKILL.md
@@ -9,7 +9,7 @@ allowed-tools: Bash Write Read
 
 The `retriever` CLI indexes a folder of PDFs into LanceDB (`retriever ingest`) and serves vector search over it (`retriever query`). For any task about searching/answering questions across a folder of PDFs, use this CLI — do not write a custom RAG.
 
-**Beyond PDFs and beyond semantic search.** `retriever ingest` also handles images, Office, HTML, TXT, audio, and video — see `references/setup.md` for the per-format recipe and `references/install.md` for the install extras (`[multimedia]`, libreoffice, ffmpeg). For non-semantic operations — page filter, verbatim quote with citation, corpus-level aggregate, chart/image caption hits — see `references/query.md`. Don't fall back to native Read/Grep/Python on non-PDF inputs.
+**Beyond PDFs and beyond semantic search.** `retriever ingest` also handles images, Office, HTML, TXT, audio, and video — see `references/setup.md` for the per-format recipe and `references/install.md` for the install extras (`[multimedia]`, libreoffice, ffmpeg). The query turn is a single command — see **§Query turn** below (inline, no reference read needed); `references/cli/query.md` holds only the fallback detail (exact-term, chart text-extract, compose-reply). Don't fall back to native Read/Grep/Python on non-PDF inputs.
 
 ## Install (if `retriever` is missing)
 
@@ -20,17 +20,27 @@ If `command -v retriever` returns nothing, follow `references/install.md` to ins
 | Turn type | Read this once | Then execute |
 | :--- | :--- | :--- |
 | **Setup turn** (first turn — `./lancedb/nemo-retriever.lance` doesn't exist) | `references/setup.md` | Build the index |
-| **Query turn** (every subsequent turn — user asks a question) | `references/query.md` | One `retriever query` call, then `Write` `./output.json` *(eval-harness contract only — for general use, just answer in chat; see `query.md` top callout)* |
+| **Query turn** (every subsequent turn — user asks a question) | **§Query turn** below (command inline — no reference read needed) | Run it, then `Write` `./output.json` *(eval-harness contract only — for general use, just answer in chat)* |
 | Anything errored or returned empty | `references/troubleshooting.md` | Apply the named recovery; do not improvise |
 
-For the full `retriever ingest` / `retriever query` CLI specs, see `references/cli/ingest.md` and `references/cli/query.md`. You do not need these for routine turns — `<RETRIEVER_VENV>/bin/retriever <subcommand> --help` is faster.
+## Query turn — run this, then write the answer
+
+`<RETRIEVER_VENV>/bin/retriever query "<question>" --format evidence --hybrid --top-k 10` → JSON
+`{ evidence: [ { text, source, locator, modality, fidelity, score, citation } ], coverage: {...} }`.
+That's the FIRST (usually only) call — don't `ls`/`find`/`sed`/Read to orient first; it already searched the whole corpus. Then:
+- **Lead with the direct answer** (the exact figure, or Yes/No) for the exact entity asked; address every entity / year / category the question names — even "not provided".
+- **Trust by fidelity** (`verbatim > ocr > transcribed > vlm_caption`): a number or directional claim resting ONLY on a `vlm_caption` (chart/image) is unconfirmed — quote it tagged "(chart-derived, unconfirmed)" unless a higher-fidelity item states the same fact. Never fabricate from adjacent text.
+- Re-`query` only if the answer isn't yet supported — once per genuinely distinct sub-question (per entity when comparing/listing), or with the exact term when `coverage.thin_spots` flags a miss.
+- Open `references/cli/query.md` ONLY for the fallback path (exact-term re-query, chart text-extract, compose-reply detail) — a normal answer needs none of it.
+
+For the full `retriever ingest` CLI spec, see `references/cli/ingest.md`. For `retriever query` flags, `<RETRIEVER_VENV>/bin/retriever query --help` is authoritative (and faster) — you do not need it for routine turns.
 
 Before ingesting a mixed folder, inventory extensions (`find <dir> -name '*.*' | sed 's/.*\.//' | sort -u`) — `--input-type=auto` silently drops anything outside the supported set. See `references/troubleshooting.md` "Unsupported file types".
 
 ## Hard limits (apply to every turn)
 
 - **Setup turn**: build the index in one shell command (see `references/setup.md`). STOP after the index lands.
-- **Query turn**: at most **2 Bash calls** — 1 `retriever query`, +1 optional targeted text-extract per `references/query.md`. Reply and then STOP.
+- **Query turn**: at most **2 Bash calls** — 1 `retriever query`, +1 optional targeted text-extract per `references/cli/query.md`. Reply and then STOP.
 - **No narration between tool calls.** Tokens you emit between calls become input + cached input for every later turn — quadratic cost. Go straight from reading the summary to writing the JSON file.
 - **Banned**: `TodoWrite`, Glob, Grep, `Read` of whole PDFs, re-running setup, spawning subagents, speculative "confirmation" calls.
 
diff --git a/skills/nemo-retriever/contract/CONTRACT.md b/skills/nemo-retriever/contract/CONTRACT.md
new file mode 100644
index 0000000000..14c56f0e9d
--- /dev/null
+++ b/skills/nemo-retriever/contract/CONTRACT.md
@@ -0,0 +1,47 @@
+# retriever skill↔engine contract
+
+`contract_version` (see `cli-contract.json`) is the semver the **skill** asserts
+about the installed **engine**. Run `scripts/doctor.py` to verify the installed
+`retriever` satisfies it.
+
+The skill's one primitive is **`retriever query <question> --format evidence --hybrid`** →
+`{ evidence, coverage }`. The `query` engine defaults are `--format hits` (a flat ranked
+list) and vector-only (`--hybrid` off); the skill opts into `--format evidence`
+(fidelity-tagged evidence + coverage) and `--hybrid` (vector+BM25) **explicitly**, so
+plain `query` callers are unaffected. `query` *also* exposes `--rerank`, `--candidate-k`,
+`--content-types`, `--page-dedup` (unused by the skill); the contract gates the skill's
+invocation + result shape, not the full flag surface.
+
+## Files
+- `cli-contract.json` — the gated surface: required subcommands, `query`'s required
+  flags + default format/hybrid, and `ingest`'s flags. `default_table_name` is the
+  engine's table-name constant (operator config), not the skill name.
+- `query-result.schema.json` — the shape `retriever query --format evidence` emits and the
+  skill reasons over: `evidence[]` (each with `text, source, locator, modality,
+  fidelity, score, citation`) + `coverage`. This is THE contract the skill relies on.
+
+## Versioning
+- Bump **patch** for clarifications, **minor** for additive engine capabilities the
+  skill can use, **major** when the engine changes something the skill relies on
+  (a `query` evidence/coverage field, the default `--format`/`--hybrid` behavior, or
+  the gated primitive). A major bump means the skill must be updated in the same change.
+- `doctor.py` fails if the installed engine no longer matches `cli-contract.json` /
+  `query-result.schema.json`.
+
+## How drift gets caught
+`doctor.py` runs on the skill's setup turn. It
+performs a LIVE probe — ingest a tiny built-in document, run `retriever query --format evidence`,
+validate `{evidence, coverage}` (including the `fidelity` enum)
+against `query-result.schema.json` — plus static `--help` checks: the required
+subcommands (`ingest`, `query`) exist and `query` exposes its required
+flags (`--top-k`, `--hybrid`, `--format`). Any divergence (a renamed evidence field, a
+missing `fidelity`, a dropped `--format`, `--input-type` reappearing on `ingest`) fails
+loudly with a remediation hint.
+
+## Changelog
+- **0.1.0** — skill-first contract built around **`retriever query --format evidence --hybrid`**
+  → `{evidence, coverage}` (validated against `query-result.schema.json`). The gated
+  subcommands are `ingest` and `query`; `query`'s engine defaults are `--format hits` and
+  vector-only, and the skill passes `--format evidence`/`--hybrid` explicitly.
+  `query` may expose extra knobs (`--rerank`, `--candidate-k`, …) — they're allowed but unused
+  by the skill, so the contract gates the invocation + result shape, not the full flag surface.
diff --git a/skills/nemo-retriever/contract/cli-contract.json b/skills/nemo-retriever/contract/cli-contract.json
new file mode 100644
index 0000000000..4332ce3813
--- /dev/null
+++ b/skills/nemo-retriever/contract/cli-contract.json
@@ -0,0 +1,20 @@
+{
+  "contract_version": "0.1.0",
+  "primitive": "query",
+  "subcommands_required": ["ingest", "query"],
+  "requires_hybrid_index": true,
+  "query": {
+    "required_flags": ["--top-k", "--hybrid", "--format"],
+    "operator_flags": ["--lancedb-uri", "--table-name", "--embed-model-name"],
+    "skill_invocation": "retriever query \"<question>\" --format evidence --hybrid",
+    "engine_default_format": "hits",
+    "engine_default_hybrid": false,
+    "result_schema": "query-result.schema.json"
+  },
+  "ingest": {
+    "required_flags": ["--append", "--overwrite", "--hybrid", "--ocr-version", "--ocr-lang", "--table-name", "--lancedb-uri", "--embed-model-name"],
+    "forbidden_flags": ["--input-type"],
+    "single_pass_multiformat": true
+  },
+  "default_table_name": "nemo-retriever"
+}
diff --git a/skills/nemo-retriever/contract/query-result.schema.json b/skills/nemo-retriever/contract/query-result.schema.json
new file mode 100644
index 0000000000..ec7d742a87
--- /dev/null
+++ b/skills/nemo-retriever/contract/query-result.schema.json
@@ -0,0 +1,39 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "retriever/query-result",
+  "title": "retriever query --format evidence result (engine, contract 0.1.0)",
+  "description": "What `retriever query <question> --format evidence` emits and the skill reasons over (query's default output is the legacy `--format hits` flat list).",
+  "type": "object",
+  "required": ["evidence", "coverage"],
+  "properties": {
+    "evidence": { "type": "array", "items": { "$ref": "#/$defs/evidence_item" } },
+    "coverage": { "$ref": "#/$defs/coverage" }
+  },
+  "additionalProperties": true,
+  "$defs": {
+    "evidence_item": {
+      "type": "object",
+      "required": ["text", "source", "locator", "modality", "fidelity", "score", "citation"],
+      "properties": {
+        "text": { "type": "string" },
+        "source": { "type": "string", "description": "source basename (.pdf stripped)" },
+        "locator": { "type": "object", "description": "{ kind: page|segment|timestamp|bbox, value }" },
+        "modality": { "type": "string", "description": "text | table | chart | image | audio | video_frame" },
+        "fidelity": { "type": "string", "enum": ["verbatim", "ocr", "transcribed", "vlm_caption"] },
+        "score": { "type": "number", "description": "relevance/distance; hit ORDER is authoritative" },
+        "citation": { "type": "string", "description": "source + locator, ready to cite" }
+      },
+      "additionalProperties": true
+    },
+    "coverage": {
+      "type": "object",
+      "required": ["strategies_used", "n_docs_seen", "thin_spots"],
+      "properties": {
+        "strategies_used": { "type": "array" },
+        "n_docs_seen": { "type": "integer" },
+        "thin_spots": { "type": "array" }
+      },
+      "additionalProperties": true
+    }
+  }
+}
diff --git a/skills/nemo-retriever/references/cli/query.md b/skills/nemo-retriever/references/cli/query.md
index dc9ed5309d..5f7587183f 100644
--- a/skills/nemo-retriever/references/cli/query.md
+++ b/skills/nemo-retriever/references/cli/query.md
@@ -1,91 +1,34 @@
-# retriever query
-
-Embed a text query and return the top-k nearest rows from a LanceDB table
-previously written by `retriever ingest` (or any compatible pipeline).
-
-If flags below look stale, re-check `retriever query --help`.
-
-## When to use this
-
-- You have already ingested documents and want to retrieve relevant
-  chunks/primitives for a natural-language query.
-- You want a one-shot CLI lookup — no service, no UI.
-
-**Use a different command when:**
-
-- You want recall metrics over a labelled query set → `retriever recall`.
-- You want to grade end-to-end QA quality → `retriever eval`.
-- You want a long-running query endpoint → `retriever service`.
-- You want to compare two retrieval runs → `retriever compare`.
-
-## Canonical invocations
-
-Top-10 search against the default table:
+# Query turn — the WHOLE workflow
 
 ```bash
-<RETRIEVER_VENV>/bin/retriever query "what is in chart 1?"
+timeout 2000 <RETRIEVER_VENV>/bin/retriever query "<the user's question>" --format evidence --hybrid --top-k 10 \
+  --embed-model-name nvidia/llama-nemotron-embed-1b-v2 --query-embed-backend hf \
+  | tee ./evidence.json
 ```
 
-Top-3, custom table:
+That's your FIRST tool call on every query turn, run **exactly** as one pipeline (cold runs take ~20–30s; wait for it — don't background it or fire parallel queries). Do not Read, Glob, Grep, or list PDFs first — those duplicate what `retriever query` already did. `--format evidence` returns answer-ready JSON:
 
-```bash
-<RETRIEVER_VENV>/bin/retriever query "average frequency ranges for tweeters" \
-  --top-k 3 \
-  --lancedb-uri ./my-lancedb \
-  --table-name my-corpus
+```
+{ "evidence": [ { text, source, locator, modality, fidelity, score, citation } ], "coverage": {...} }
 ```
 
-## Inputs
-
-- **Positional `QUERY`** — single text string. Required. Quote it in the shell
-  to keep multi-word queries intact.
-
-## Outputs
+`tee ./evidence.json` keeps the full result in the cwd (not `/tmp` — clobbered under parallel queries). Read it back only as needed (`<RETRIEVER_VENV>/bin/python -c "import json; print(json.load(open('./evidence.json'))['evidence'][0]['text'])"`); pulling all chunks' text into context inflates cached prompt size on every later turn.
 
-- JSON array on stdout, one object per hit, in retriever ranking order.
-- The root CLI intentionally returns compact objects:
-  - `source` — origin document path.
-  - `page_number` — 1-indexed page when available.
-  - `text` — retrieved primitive text, table text, chart text, or image caption.
-- Internal scores, raw metadata, and bounding boxes are available from the Python
-  `Retriever.query(...)` API, not the public root CLI output.
+**No narration between tool calls.** Do not write "Let me search…", "The retriever returned…", or any commentary — every token between the `query` call and the `Write` of `./output.json` becomes input (and cached input) for every later turn (quadratic cost). Go straight from reading the result to writing the file.
 
-## Key flags
+Each evidence item carries: `text`, `source` (doc basename), `locator` (`{kind, value}` — e.g. `{kind: page, value: <int, 1-indexed>}`; the result schema also allows `segment`, `timestamp`, and `bbox` kinds), `modality` (`text|table|chart|image|audio|video_frame`), **`fidelity`** (`verbatim > ocr > transcribed > vlm_caption`), `score`, and `citation` (ready-to-quote source + locator). Hit ORDER is authoritative; `score` is informational.
 
-| Flag | Default | Notes |
-|---|---|---|
-| `--top-k` | `10` | Final number of hits to return. Must be >= 1. |
-| `--candidate-k` | unset | Wider pre-filter/pre-dedup candidate pool. When set, it must be >= `--top-k`; make it larger when `--page-dedup` or `--content-types` could reduce final hits. |
-| `--page-dedup` | `false` | Collapse results to unique document pages. |
-| `--content-types` | unset | Comma-separated content types to keep, such as `text,table` or `image,chart`; query-time values are normalized to canonical hit metadata types, `images` is accepted as an alias for captioned image rows, and untyped hits are excluded. |
-| `--lancedb-uri` | `lancedb` | Must match what `ingest` wrote to. |
-| `--table-name` | `nemo-retriever` | Must match what `ingest` wrote to. |
+## Trust by fidelity — the core of a correct answer
 
-## Ranking interpretation
+A number or directional claim resting ONLY on a `vlm_caption` (a VLM-generated chart/image caption) is **unconfirmed** — such captions often flip direction words (`increase`↔`decrease`) or misread exact figures. Prefer `verbatim`/`ocr` evidence for exact values (`table` is a modality, not a fidelity — judge a table row by its `fidelity` field). If the figure you need appears only in a `vlm_caption`, quote it exactly as given and tag "(chart-derived, unconfirmed)" unless a higher-fidelity item states the same fact. Never upgrade a low-fidelity reading to a confident fact.
 
-- The embedder (`llama-nemotron-embed-vl-1b-v2`) returns mean-pooled vectors;
-  LanceDB ranks by L2 distance by default. The root CLI hides raw distance values;
-  treat result order as ranking-only, not calibrated confidence.
-- The query uses the **VL** variant of the embedder so text queries can match
-  ingested image/chart embeddings as well as text. Expect mixed-modality hits
-  in the result list.
+## When the answer isn't in the first result
 
-## Common failure modes
+Re-`query` only when the top evidence doesn't yet answer — for a genuinely *distinct* sub-question (per entity when comparing/listing), or **with the exact term/phrase** when `coverage.thin_spots` flags a miss or a specific ID/code/figure isn't in the returned text (the fused BM25 leg matches exact strings semantic search skips — e.g. re-query `"mRNA-1273"` to surface every chunk that names it). Read `coverage.thin_spots` to tell "broaden the search" from "out of corpus". Do NOT re-issue reworded variants of the same question, reach for `pdftotext`/`pdfgrep`, or open the LanceDB table yourself — `query` already searched the whole corpus.
 
-- **Empty result array** — table is empty (no ingest run yet) or
-  `--table-name` / `--lancedb-uri` don't match where ingest wrote.
-- **`Table 'nemo-retriever' was not found`** — same root cause: wrong table/URI,
-  or ingest hasn't been run.
-- **First query is slow (~10–15s)** — vLLM startup for the query embedder.
-  Subsequent queries in the same process are sub-second; one-shot CLI
-  invocations always pay this cost.
-- **Surprisingly low-relevance top hit** — for very short corpora, even
-  unrelated queries return *something*. Broaden with `--candidate-k`, use
-  `--page-dedup` for page diversity, or use `--content-types` for targeted
-  table/chart/image-caption searches.
+## Compose your reply from the evidence
 
-## Related
+- `final_answer`: **lead with the direct answer** — the exact figure (in the evidence's own units) or a bare Yes/No, for the exact entity asked — then support it. Synthesize from the evidence `text`. One paragraph, no restating the question, no hedging caveats. **Re-read the question**: address every entity / year / category it names, even those the evidence marks "not provided" (missing entities lose more judge points than imprecise numbers). If the asked-for fact isn't in the evidence, say so explicitly — never invent or extrapolate from adjacent material.
+- `ranked_retrieved`: one entry per evidence item in returned order: `{"doc_id": "<source>", "page_number": <locator.value>, "rank": <i+1>}`. Up to 10. **Indexing:** `locator.value` is 1-indexed; if the task's schema says 0-indexed, emit `value - 1`, else emit as-is.
 
-- [[ingest]] — populate the table this command reads.
-- `retriever recall --help` — batch query → recall@k against ground truth.
-- `retriever eval --help` — end-to-end QA evaluation.
+After your reply, STOP. No print, no summary, no further tool calls.
diff --git a/skills/nemo-retriever/references/query.md b/skills/nemo-retriever/references/query.md
deleted file mode 100644
index 8d451d837d..0000000000
--- a/skills/nemo-retriever/references/query.md
+++ /dev/null
@@ -1,71 +0,0 @@
-# Query turn — the WHOLE workflow
-
-
-```bash
-timeout 2000 <RETRIEVER_VENV>/bin/retriever query "<the user's question>" --top-k 10 --embed-model-name nvidia/llama-nemotron-embed-1b-v2 --query-embed-backend hf --reranker-backend hf --rerank \
-  | tee ./hits.json \
-  | <RETRIEVER_VENV>/bin/python -c "import json,sys,os; [print(f'rank={i+1} page={h[\"page_number\"]} doc={os.path.basename(h[\"source\"])}') for i,h in enumerate(json.load(sys.stdin))]"
-```
-
-Run that **exactly** as a single pipeline — do not split it into `HITS=$(...)` + `echo "$HITS" | <RETRIEVER_VENV>/bin/python -c ...` (the assignment swallows stdout, the pipe sees nothing, you waste 3 bash calls recovering). Stdout is clean JSON (model-init logs are silenced at the CLI layer); leave stderr unredirected so real errors surface on the first call. The full hits land in `./hits.json` **in the current working directory** (not `/tmp` — a shared `/tmp` path gets clobbered when queries run in parallel). The summary above lists rank/page/doc — to read hit text for synthesizing `final_answer`, parse `./hits.json` directly. The top hit's text is one one-liner away: `<RETRIEVER_VENV>/bin/python -c "import json; print(json.load(open('./hits.json'))[0]['text'])"` (or `[i]` for the rank-(i+1) hit). Fetch only what you need — pulling all 10 hits' text into context inflates cached prompt size on every subsequent turn.
-
-That's your FIRST tool call on every query turn. Do not Read, Glob, Grep, or list PDFs before this — those duplicate what `retriever query` already did.
-
-`--query-embed-backend hf` and `--reranker-backend hf` run the query embedder and reranker via HuggingFace instead of vLLM: a single query then loads in ~20–30s (vLLM's batch engine cold-starts much slower and hogs GPU memory). Same model, same hits — just a faster, lighter cold start for one-off queries. (Ingest still uses vLLM for batch throughput.)
-
-**No narration between tool calls.** Do not write "Let me search…", "I'll now analyze…", "The retriever returned…", or any other commentary. Every assistant token you emit between the `retriever query` Bash call and the `Write` of `./output.json` becomes input tokens (and cached input tokens) for every subsequent turn in this session — quadratic cost. Go straight from reading the summary to writing the JSON file. The only assistant text in a query turn should be the tool calls themselves.
-
-Each hit has exactly three keys: `source` (the **full PDF path** — the doc_id is its basename, `os.path.basename(h["source"])[:-4]` to drop `.pdf`), `page_number` (int, **1-indexed**: the first page of a PDF is page `1`), and `text`. There is no `pdf_basename`, `metadata`, `pdf_page`, or `_distance` field — referencing those raises `KeyError`.
-
-## Keyword/regex search across the corpus
-
-If you need exact text matches that semantic `retriever query` may have skipped — e.g. "find every mention of 'mRNA-1273' across all PDFs" — use:
-
-```bash
-<RETRIEVER_VENV>/bin/python <skill_dir>/scripts/grep_corpus.py "<regex>" [--max-hits 50]
-```
-
-It scans the LanceDB table the retriever already built — no PDF re-extraction. Output is `<pdf>:p<page>:<type>:  ...<snippet>...` per hit; `NO_MATCH` if nothing. Counts against the same "one optional follow-up call" budget as the targeted text-extract (mutually exclusive — pick one).
-
-Don't reach for `pdftotext`, `pdftohtml`, or `pdfgrep` — they're system tools that aren't guaranteed installed on the user's machine. The retriever venv bundles pdfium and `lancedb`; `grep_corpus.py` and `retriever pdf stage page-elements --method pdfium` cover the same use cases without that dependency.
-
-## Compose your reply from the hits
-
-- `final_answer`: synthesize from the top hits' `text`. Include the exact number / name / date / row / column the question asks for, plus the source PDF and 0-indexed page. One paragraph. No restating the question, no hedging caveats. If the chunks talk *around* the fact but don't state it, run ONE `<RETRIEVER_VENV>/bin/retriever pdf stage page-elements ./pdfs --method pdfium --json-output-dir /tmp/pdf_text --compact-json` and `Read` `/tmp/pdf_text/<top_pdf>.pdf.pdf_extraction.json` for the rank-1 page (or rank-2 if rank-1 is metadata) — that almost always surfaces the exact figure. Then synthesize. **If after both calls the asked-for fact still isn't in the evidence, write `final_answer` that says so explicitly** — e.g. "The retrieved pages do not state [X] for [entity]; the closest content is [Y]." Do NOT invent, extrapolate, or generate plausible-sounding content from adjacent material. A confidently-wrong answer scores worse than an honest "not in the retrieved pages".
-- `ranked_retrieved`: one entry per hit in the order `retriever query` returned: `{"doc_id": "<pdf_basename without .pdf>", "page_number": <int>, "rank": <i+1>}`. Up to 10. Duplicate `(doc, page)` is fine. **Indexing:** the retriever's `page_number` is 1-indexed. If the task's output schema says 0-indexed (e.g. "first page is page 0"), emit `hit.page_number - 1`; if the task says 1-indexed or doesn't specify, emit `hit.page_number` as-is.
-
-**Before writing `final_answer`, re-read the question.** If it lists multiple entities, years, or categories, your answer must address each one explicitly — even if for some of them the chunks say "not provided" or contain no data. Missing entities lose more judge points than imprecise numbers.
-
-## Charts and images — the single biggest source of judge=2/3 trials
-
-When `metadata.type` of a hit is `chart` or `image`, its `text` field is a model-generated transcription that frequently:
-
-- reverses direction words (`increase`↔`decrease`, `rose`↔`fell`, `surge`↔`drop`), and
-- rounds or misreads exact percentages (e.g. transcribing 12% as 20%).
-
-If a question asks for an exact percentage or a directional claim **and the evidence is only a chart/image hit** (no `text`-type hit corroborates the same number or direction):
-
-1. Run the targeted `<RETRIEVER_VENV>/bin/retriever pdf stage page-elements --method pdfium` text-extract on the rank-1 PDF (this counts as your second tool call) and look for the number in prose.
-2. If prose confirms the chart number, assert it confidently.
-3. If prose doesn't mention it, **quote the chart transcription verbatim with an explicit hedge in `final_answer`**: "The chart on page N indicates [verbatim phrase] (chart-derived, not verified against prose)." Do NOT restate the chart's number as a confident fact.
-
-When both a chart hit and a text hit cover the same fact, always prefer the text hit's number.
-After your reply, STOP. No print, no summary, no further tool calls.
-
-## Non-semantic operations (use these, don't fall back to native tools)
-
-**Page filter** — "what's on page N of doc.pdf" → filter LanceDB directly, no `Read`:
-
-```bash
-<RETRIEVER_VENV>/bin/python -c "import lancedb,json; df=lancedb.connect('./lancedb').open_table('nemo-retriever').to_pandas(); print('\n'.join(r['text'] for _,r in df.iterrows() if json.loads(r['source'])['source_name']=='APPLE_2022_10K.pdf' and json.loads(r['metadata'])['page_number']==14))"
-```
-
-**Verbatim quote with `[page]` citation** — quote retrieved chunks with `[page N]` markers in `final_answer`; don't paraphrase.
-
-**Corpus-level aggregate** — "list distinct sources", "count chunks per source" → no `ls`/`grep`/`find`:
-
-```bash
-<RETRIEVER_VENV>/bin/python -c "import lancedb,json; from collections import Counter; df=lancedb.connect('./lancedb').open_table('nemo-retriever').to_pandas(); names=[json.loads(s)['source_name'] for s in df['source']]; print(sorted(set(names))); print(dict(Counter(names)))"
-```
-
-**Image / chart captioning** — when the user asks to *describe / caption* an image (prose summary, not OCR text): `retriever ingest` already produces chart/image-type hits whose `text` field is the model-generated caption (see "Charts and images" above). Workflow: ingest the image folder (`setup.md` image recipe), then `retriever query` with a topic-related question — the hits with `metadata.type=chart|image` carry the caption in `text`. Use that as `final_answer`. No separate captioning CLI command.
diff --git a/skills/nemo-retriever/references/setup.md b/skills/nemo-retriever/references/setup.md
index 18f30fa80a..18dd0740c0 100644
--- a/skills/nemo-retriever/references/setup.md
+++ b/skills/nemo-retriever/references/setup.md
@@ -6,13 +6,13 @@
 TOTAL_PAGES=$(<RETRIEVER_VENV>/bin/python -c "import pypdfium2, glob; print(sum(len(pypdfium2.PdfDocument(p)) for p in glob.glob('./pdfs/*.pdf')))" 2>/dev/null || echo 0)
 echo "total_pages=$TOTAL_PAGES"
 if [ "$TOTAL_PAGES" -le 50000 ]; then
-  <RETRIEVER_VENV>/bin/retriever ingest ./pdfs/ --embed-model-name nvidia/llama-nemotron-embed-1b-v2
+  <RETRIEVER_VENV>/bin/retriever ingest ./pdfs/ --hybrid --embed-model-name nvidia/llama-nemotron-embed-1b-v2
 else
-  <RETRIEVER_VENV>/bin/retriever pipeline run ./pdfs/ --run-mode inprocess --method pdfium --no-extract-tables --no-extract-charts --no-extract-page-as-image --evaluation-mode none --embed-model-name nvidia/llama-nemotron-embed-1b-v2 --vdb-kwargs-json '{"uri": "lancedb", "table_name": "nemo-retriever"}' --quiet
+  <RETRIEVER_VENV>/bin/retriever pipeline run ./pdfs/ --run-mode inprocess --method pdfium --no-extract-tables --no-extract-charts --no-extract-page-as-image --evaluation-mode none --embed-model-name nvidia/llama-nemotron-embed-1b-v2 --vdb-kwargs-json '{"uri": "lancedb", "table_name": "nemo-retriever", "hybrid": true}' --quiet
 fi
 ```
 
-Both branches write the **same** LanceDB table, `lancedb/nemo-retriever` — the table `retriever query` reads by default. `retriever ingest` defaults to that table automatically; `retriever pipeline run` has no `--table-name` flag and would otherwise default to `nv-ingest`, so the `else` branch pins it with `--vdb-kwargs-json '{"uri": "lancedb", "table_name": "nemo-retriever"}'`. Keep these aligned or queries will read an empty table.
+`--hybrid` (and `"hybrid": true` in the pipeline branch's `--vdb-kwargs-json`) builds a full-text (BM25) index alongside the vectors, so `retriever query`'s fused lexical leg can match exact terms (IDs, codes, rare phrases); without it, hybrid queries silently fall back to vector-only. Both branches write the **same** LanceDB table, `lancedb/nemo-retriever` — the table `retriever query` reads by default. `retriever ingest` defaults to that table automatically; `retriever pipeline run` has no `--table-name` flag and would otherwise default to `nv-ingest`, so the `else` branch pins it with `--vdb-kwargs-json '{"uri": "lancedb", "table_name": "nemo-retriever", "hybrid": true}'`. Keep these aligned or queries will read an empty table.
 
 `retriever ingest` is quiet by default; the `else` (`retriever pipeline run`) branch needs `--quiet` passed explicitly. Quiet mode suppresses progress bars, HuggingFace download logs, vLLM init noise, Ray worker stdout, and INFO-level pipeline status lines on success, while still flushing captured output to stderr on error. Without it the `pipeline run` branch burns thousands of tokens on irrelevant progress output. On success you only see one line: `Ingested N document(s) into LanceDB lancedb/nemo-retriever.` (for `retriever ingest`) or `Pipeline complete: N page(s) → lancedb lancedb/nemo-retriever (T.Ts).` (for `retriever pipeline run`).
 
@@ -24,7 +24,7 @@ After the setup command returns successfully, STOP. Don't run smoke queries to "
 
 ## Other input shapes
 
-Same `retriever ingest` command, different `--input-type` and (for non-PDF) install extras. Install extras live in `references/install.md` "Optional extras".
+Same `retriever ingest` command, different `--input-type` and (for non-PDF) install extras. Add `--hybrid` to each (as in the PDF recipe) so exact-term search works. Install extras live in `references/install.md` "Optional extras".
 
 **Images / scanned forms / charts** (`.jpg` `.png` `.tiff` `.bmp`):
 
diff --git a/skills/nemo-retriever/references/troubleshooting.md b/skills/nemo-retriever/references/troubleshooting.md
index eaf93188b6..e210e6ac67 100644
--- a/skills/nemo-retriever/references/troubleshooting.md
+++ b/skills/nemo-retriever/references/troubleshooting.md
@@ -2,7 +2,7 @@
 
 Read this only after you hit one of the named errors below. Don't read it pre-emptively.
 
-## If the index is missing or `retriever query` returns `[]`
+## If the index is missing or `retriever query` returns empty `evidence`
 
 Means ingest didn't complete (e.g. the text-only pipeline still hit the turn wall, or the table is empty). Tight fallback using the retriever's own pdfium-based extractor (always available — same binary the agent just used for `retriever query`):
 
@@ -18,10 +18,10 @@ For an unlisted subcommand: `<RETRIEVER_VENV>/bin/retriever <subcommand> --help`
 ## Failure modes (expected, not errors)
 
 - **First `ingest` takes ~60s+** — vLLM warmup. Expected.
-- **First `query` is slow** — embedder (and reranker, with `--rerank`) cold-start. ~10–15s on an idle GPU, but **1–3 minutes under concurrent load**. Expected — wait for it; do not kill or relaunch. It is wrapped in `timeout 2000`, so let it run to that ceiling before treating it as failed.
-- **Empty result** — ingest didn't run. Use the fallback above.
+- **First `query` is slow** — embedder cold-start. ~10–15s on an idle GPU, but **1–3 minutes under concurrent load**. Expected — wait for it; do not kill or relaunch. It is wrapped in `timeout 2000`, so let it run to that ceiling before treating it as failed.
+- **Empty `evidence`** — ingest didn't run (use the fallback above), or the question is genuinely out-of-corpus — read `coverage.thin_spots` to tell which.
 - **`Clamping num_partitions ...`** — informational on tiny corpora, not an error.
-- **Low-relevance top hit on tiny corpus** — look at `_distance` *gaps* between hits, not absolute values.
+- **Low-relevance top hit on tiny corpus** — even an unrelated query returns *something*; trust the ranking order (the `score` field is informational, not calibrated confidence).
 - **Page-element-detection warnings during ingest** — non-fatal as long as the embedding step itself succeeds (and they're silenced on a successful run, since `ingest` is quiet by default).
 
 ## Unsupported file types (silent filter — the v2 regression mode)
diff --git a/skills/nemo-retriever/scripts/doctor.py b/skills/nemo-retriever/scripts/doctor.py
new file mode 100644
index 0000000000..9e4b547ba9
--- /dev/null
+++ b/skills/nemo-retriever/scripts/doctor.py
@@ -0,0 +1,218 @@
+#!/usr/bin/env python
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Verify the installed `retriever` engine satisfies the skill's contract.
+
+Usage: <RETRIEVER_VENV>/bin/python skills/nemo-retriever/scripts/doctor.py
+Exits 0 if all checks pass, 1 otherwise. Always runs a LIVE ingest+query probe.
+
+The skill's one primitive is `retriever query --format evidence --hybrid` ->
+{evidence, coverage}; the live probe gates on the `--format evidence` result shape (it
+ingests without `--hybrid` and queries with `--no-hybrid`). `query`'s DEFAULTS are unchanged
+(legacy `hits` output, vector-only): `evidence`/`hybrid` are opt-in, so defaults are not gated.
+"""
+import json
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+HERE = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
+CONTRACT_DIR = os.path.join(os.path.dirname(HERE), "contract")  # sibling "contract" dir of this script's dir
+EMBED_MODEL = "nvidia/llama-nemotron-embed-1b-v2"  # passed as --embed-model-name to the live ingest and query
+# Tiny self-contained probe doc, written to a temp corpus so the live ingest+query
+# check needs no external fixture file.
+PROBE_TEXT = (
+    "Contract probe document.\n"
+    "The capital of the test corpus is Probeville.\n"
+    "This single short text file exists only so doctor.py can ingest one tiny "
+    "document and run one query to assert the live hit schema.\n"
+)
+
+results = []  # (ok: bool, label: str, detail: str) -- appended by check(), printed by report()
+
+
+def check(ok, label, detail=""):  # record one outcome as (bool(ok), label, detail) in the module-level `results`
+    results.append((bool(ok), label, detail))
+
+
+def retriever_bin():  # path of the `retriever` executable found on PATH, or None when there is none
+    return shutil.which("retriever")
+
+
+def help_text(bin_path, subcmd):
+    # Return combined stdout+stderr of `<bin_path> <subcmd> --help`, or "__ERROR__ <exc>" if the call fails.
+    # COLUMNS=200: a wide terminal keeps rich/click from truncating flags ("--embed-model-na…") we substring-match.
+    env = dict(os.environ, COLUMNS="200")
+    try:
+        out = subprocess.run([bin_path, subcmd, "--help"], capture_output=True, text=True, timeout=60, env=env)
+        return (out.stdout or "") + (out.stderr or "")
+    except Exception as e:  # noqa: BLE001
+        return f"__ERROR__ {e}"
+
+
+def main():  # run the static and live contract checks; returns report()'s exit code
+    with open(os.path.join(CONTRACT_DIR, "cli-contract.json")) as _f:  # a missing file raises; it is not a FAIL row
+        contract = json.load(_f)
+    with open(os.path.join(CONTRACT_DIR, "query-result.schema.json")) as _f:
+        rr_schema = json.load(_f)
+    item_schema = rr_schema["$defs"]["evidence_item"]
+    cov_schema = rr_schema["$defs"]["coverage"]
+
+    bin_path = retriever_bin()
+    check(  # NOTE(review): hint path says skills/retriever/, but this file is added under skills/nemo-retriever/
+        bin_path is not None, "retriever CLI on PATH", "" if bin_path else "run skills/retriever/references/install.md"
+    )
+    if not bin_path:  # every later check needs the binary, so report immediately
+        return report()
+
+    # --- Required subcommands exist (static, no GPU) ---
+    for sub in contract.get("subcommands_required", []):
+        try:
+            rc = subprocess.run([bin_path, sub, "--help"], capture_output=True, text=True, timeout=60).returncode
+        except Exception:  # noqa: BLE001
+            rc = 1  # a launch failure or timeout is treated the same as a missing subcommand
+        check(rc == 0, f"subcommand `{sub}` exists")
+
+    # --- query flag surface: required flags present (static). `query` is now both the
+    # skill primitive and the power-user tool, so strategy knobs are allowed. ---
+    rhelp = help_text(bin_path, "query")
+    for flag in contract["query"]["required_flags"]:
+        check(flag in rhelp, f"query has {flag}")
+
+    # --- ingest flag surface (static, no GPU) ---
+    ihelp = help_text(bin_path, "ingest")
+    for flag in contract["ingest"]["required_flags"]:
+        check(flag in ihelp, f"ingest has {flag}")
+    for flag in contract["ingest"]["forbidden_flags"]:  # NOTE(review): vacuously true if help_text() errored
+        check(
+            flag not in ihelp, f"ingest does NOT have {flag}", "engine changed: skill assumes single-pass auto-detect"
+        )
+
+    # --- Live probe: ingest tiny fixture, retrieve, validate result shape (GPU) ---
+    tmp = tempfile.mkdtemp(prefix="retriever_doctor_")
+    try:
+        corpus = os.path.join(tmp, "corpus")
+        os.makedirs(corpus)
+        with open(os.path.join(corpus, "contract_probe.txt"), "w") as probe_f:
+            probe_f.write(PROBE_TEXT)
+        uri = os.path.join(tmp, "lancedb")
+        table = "contract_probe"
+        ing = subprocess.run(  # NOTE(review): a TimeoutExpired here or in the query below is not caught
+            [
+                bin_path,
+                "ingest",
+                corpus + "/",
+                "--table-name",
+                table,
+                "--lancedb-uri",
+                uri,
+                "--embed-model-name",
+                EMBED_MODEL,
+                "--quiet",
+            ],
+            capture_output=True,
+            text=True,
+            timeout=900,
+        )
+        check(ing.returncode == 0, "live ingest of fixture", ing.stderr.strip()[-300:])  # detail: stderr tail
+
+        r = subprocess.run(  # the query runs even when the ingest check above failed
+            [
+                bin_path,
+                "query",
+                "What is the capital of the test corpus?",
+                "--format",
+                "evidence",
+                "--top-k",
+                "3",
+                "--no-hybrid",  # the probe table above was ingested without --hybrid
+                "--table-name",
+                table,
+                "--lancedb-uri",
+                uri,
+                "--embed-model-name",
+                EMBED_MODEL,
+            ],
+            capture_output=True,
+            text=True,
+            timeout=600,
+        )
+        check(r.returncode == 0, "live query --format evidence", r.stderr.strip()[-300:])
+        result = None
+        if r.returncode == 0:  # stdout is parsed only when the query succeeded
+            try:
+                result = json.loads(r.stdout)
+                check(
+                    isinstance(result, dict) and "evidence" in result and "coverage" in result,
+                    "query emits {evidence, coverage}",
+                )
+            except Exception as e:  # noqa: BLE001
+                check(False, "query stdout is JSON", str(e))
+        if isinstance(result, dict):
+            ev = result.get("evidence")
+            check(isinstance(ev, list) and len(ev) > 0, "query returned evidence")
+            if isinstance(ev, list) and ev:
+                ok, why = validate(ev[0], item_schema)  # only the first evidence item is validated
+                check(ok, "evidence item matches query-result schema", why)
+            cov = result.get("coverage")
+            check(isinstance(cov, dict), "coverage is an object")
+            if isinstance(cov, dict):
+                ok, why = validate(cov, cov_schema)
+                check(ok, "coverage matches query-result schema", why)
+    finally:
+        shutil.rmtree(tmp, ignore_errors=True)  # always remove the temp corpus and LanceDB directory
+
+    return report()
+
+
+def validate(obj, schema):
+    """Tiny dependency-free validator: required fields, types (incl. unions), enums. Returns (ok, reason)."""
+    if not isinstance(obj, dict):
+        return False, "value is not an object"
+    for req in schema.get("required", []):
+        if req not in obj:
+            return False, f"missing required field '{req}'"
+    types = {  # JSON-schema type keyword -> Python type(s) accepted for it
+        "integer": int,  # NOTE(review): bool subclasses int, so True/False also satisfy "integer"/"number"
+        "string": str,
+        "number": (int, float),
+        "object": dict,
+        "array": list,
+        "null": type(None),
+        "boolean": bool,
+    }
+    for name, spec in schema.get("properties", {}).items():  # top-level properties only; no recursion
+        if name not in obj:
+            continue  # absent optional field: nothing to check
+        if "type" in spec:
+            allowed = spec["type"] if isinstance(spec["type"], list) else [spec["type"]]  # normalise to a list
+            pytypes = []
+            for key in allowed:
+                mapped = types.get(key)
+                if mapped is None:
+                    continue  # unknown type keyword is ignored rather than failed
+                pytypes.extend(mapped if isinstance(mapped, tuple) else [mapped])
+            if pytypes and not isinstance(obj[name], tuple(pytypes)):  # no known keyword -> type check skipped
+                return False, f"field '{name}' should be {spec['type']}, got {type(obj[name]).__name__}"
+        if "enum" in spec and obj[name] not in spec["enum"]:
+            return False, f"field '{name}'={obj[name]!r} not in {spec['enum']}"
+    return True, ""  # the first failure returns early above; reaching here means every check passed
+
+
+def report():  # print one PASS/FAIL line per recorded check plus a summary; returns 1 if any failed, else 0
+    failed = [r for r in results if not r[0]]
+    for ok, label, detail in results:
+        mark = "PASS" if ok else "FAIL"
+        line = f"[{mark}] {label}"
+        if detail and not ok:  # detail text is shown only for failed checks
+            line += f"  -- {detail}"
+        print(line)
+    print(f"\n{len(results) - len(failed)}/{len(results)} checks passed")
+    return 1 if failed else 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/skills/nemo-retriever/scripts/filename_fast_path.py b/skills/nemo-retriever/scripts/filename_fast_path.py
deleted file mode 100644
index f11bfd8223..0000000000
--- a/skills/nemo-retriever/scripts/filename_fast_path.py
+++ /dev/null
@@ -1,161 +0,0 @@
-"""Query-turn filename fast path for the nemo-retriever skill.
-
-Reads `./pdfs/` from the current working directory. If the query string
-literally contains any PDF basename (with or without the `.pdf` extension,
-stem ≥6 chars, case-insensitive), runs `retriever pdf stage page-elements`
-on each matched file via pdfium, ranks pages by query-token frequency,
-and emits a top-10 ranking + the top page's raw text.
-
-Invoked from SKILL.md as:
-    <RETRIEVER_VENV>/bin/python <skill_dir>/scripts/filename_fast_path.py "$QUERY"
-
-The retriever binary is resolved from sys.executable's directory, so the
-script is portable across venvs.
-
-Stdout protocol (exactly one of):
-- `NO_MATCH\n`                    — no PDF basename in the query.
-- `NO_TEXT\n`                     — matches found but extraction produced no
-                                    text on any page (image-only PDFs).
-- `<JSON>\n---TOP_PAGE_TEXT---\n<text>` — JSON with a "ranking" list of
-                                    {doc_id, page_number, rank} (1-indexed
-                                    pages, up to 10), followed by the top-
-                                    ranked page's raw text (first 4000 chars).
-
-Exit code is 0 in all three success outcomes; non-zero only on hard errors
-(missing ./pdfs, page-elements subprocess failure, malformed sidecar JSON).
-"""
-
-from __future__ import annotations
-
-import json
-import os
-import re
-import subprocess
-import sys
-
-PDF_DIR = "./pdfs"
-EXTRACT_OUT = "/tmp/pdf_text"
-MIN_STEM_LEN = 6
-TOP_K = 10
-TOP_PAGE_TEXT_CHARS = 4000
-
-STOPWORDS = frozenset(
-    "the a an of in on for to and or is are was were what which how when "
-    "where who why this that these those with by from as at be it its do "
-    "does did please could would should tell me you i we us our my".split()
-)
-
-
-def find_matches(query_lower: str, basenames: list[str]) -> list[str]:
-    """Return PDF basenames whose name (with or without .pdf) appears verbatim
-    in the lowercased query. Skip stems shorter than MIN_STEM_LEN."""
-    matches = []
-    for name in basenames:
-        stem, ext = os.path.splitext(name)
-        if ext.lower() != ".pdf" or len(stem) < MIN_STEM_LEN:
-            continue
-        if name.lower() in query_lower or stem.lower() in query_lower:
-            matches.append(name)
-    return matches
-
-
-def extract_pages(retriever_bin: str, matches: list[str]) -> None:
-    os.makedirs(EXTRACT_OUT, exist_ok=True)
-    for m in matches:
-        subprocess.run(
-            [
-                retriever_bin,
-                "pdf",
-                "stage",
-                "page-elements",
-                f"{PDF_DIR}/{m}",
-                "--method",
-                "pdfium",
-                "--json-output-dir",
-                EXTRACT_OUT,
-                "--compact-json",
-            ],
-            check=True,
-        )
-
-
-def sidecar_path(pdf_name: str) -> str | None:
-    stem = os.path.splitext(pdf_name)[0]
-    candidates = (
-        f"{EXTRACT_OUT}/{pdf_name}.pdf_extraction.json",
-        f"{EXTRACT_OUT}/{stem}.pdf.pdf_extraction.json",
-    )
-    for c in candidates:
-        if os.path.exists(c):
-            return c
-    return None
-
-
-def page_records(sidecar: str) -> list[dict]:
-    data = json.load(open(sidecar))
-    if isinstance(data, list):
-        return data
-    if isinstance(data, dict):
-        return data.get("pages") or data.get("documents") or []
-    return []
-
-
-def page_text(rec: dict) -> str:
-    txt = rec.get("text") or rec.get("content") or ""
-    if not txt and isinstance(rec.get("primitives"), list):
-        txt = " ".join(p.get("text", "") for p in rec["primitives"] if isinstance(p, dict))
-    return txt or ""
-
-
-def tokenize(query: str) -> list[str]:
-    return [t for t in re.split(r"[^a-z0-9]+", query.lower()) if t and t not in STOPWORDS and len(t) > 2]
-
-
-def rank_pages(matches: list[str], toks: list[str]) -> list[tuple[int, int, str, str]]:
-    """Return list of (score, page_number, doc_stem, text) sorted by
-    descending score, ascending page number."""
-    scored = []
-    for m in matches:
-        sidecar = sidecar_path(m)
-        if sidecar is None:
-            continue
-        stem = os.path.splitext(m)[0]
-        for rec in page_records(sidecar):
-            pn = rec.get("page_number") or rec.get("page") or 0
-            txt = page_text(rec)
-            score = sum(txt.lower().count(t) for t in toks)
-            if score > 0:
-                scored.append((score, pn, stem, txt))
-    scored.sort(key=lambda r: (-r[0], r[1]))
-    return scored
-
-
-def main() -> int:
-    if len(sys.argv) != 2:
-        print(f"usage: {sys.argv[0]} <query>", file=sys.stderr)
-        return 2
-    query = sys.argv[1]
-    ql = query.lower()
-    retriever_bin = os.path.join(os.path.dirname(sys.executable), "retriever")
-
-    basenames = sorted(p for p in os.listdir(PDF_DIR) if p.lower().endswith(".pdf"))
-    matches = find_matches(ql, basenames)
-    if not matches:
-        print("NO_MATCH")
-        return 0
-
-    extract_pages(retriever_bin, matches)
-    scored = rank_pages(matches, tokenize(ql))
-    if not scored:
-        print("NO_TEXT")
-        return 0
-
-    ranking = [{"doc_id": s[2], "page_number": s[1], "rank": i + 1} for i, s in enumerate(scored[:TOP_K])]
-    print(json.dumps({"ranking": ranking}))
-    print("---TOP_PAGE_TEXT---")
-    print(scored[0][3][:TOP_PAGE_TEXT_CHARS])
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/skills/nemo-retriever/scripts/grep_corpus.py b/skills/nemo-retriever/scripts/grep_corpus.py
deleted file mode 100644
index 1471b6e4c0..0000000000
--- a/skills/nemo-retriever/scripts/grep_corpus.py
+++ /dev/null
@@ -1,99 +0,0 @@
-"""Case-insensitive keyword/regex search over the corpus via the LanceDB index.
-
-This script scans the already-built LanceDB table, so it returns matches
-across every chunk `retriever ingest` indexed (text, table, chart, image
-transcriptions where present) without re-reading any PDF.
-
-Usage:
-    <RETRIEVER_VENV>/bin/python <skill_dir>/scripts/grep_corpus.py <pattern> \\
-        [--max-hits 50] [--lancedb-uri ./lancedb] [--table-name nemo-retriever]
-
-`pattern` is a Python regex, case-insensitive. For a literal-string search,
-just write the string — most identifier characters (`.`, `-`, `_`, digits,
-letters) are unambiguous unless you include regex metacharacters
-(`(`, `|`, `*`, `?`, `[`, `]`, `\\`, `^`, `$`).
-
-Output (one line per hit; sorted by pdf_basename then page_number):
-    <pdf_basename>:p<page_number>:<type>:  ...<snippet around match>...
-
-Prints `NO_MATCH` on zero hits. Caps at `--max-hits` to keep the turn output
-bounded; raise it if you really want more.
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import re
-import sys
-
-
-def main() -> int:
-    ap = argparse.ArgumentParser()
-    ap.add_argument("pattern", help="Python regex (case-insensitive)")
-    ap.add_argument("--max-hits", type=int, default=50)
-    ap.add_argument("--snippet-pad", type=int, default=60)
-    ap.add_argument("--lancedb-uri", default="./lancedb")
-    ap.add_argument("--table-name", default="nemo-retriever")
-    args = ap.parse_args()
-
-    try:
-        import lancedb
-    except ImportError:
-        print("ERROR: lancedb not importable. Run with <RETRIEVER_VENV>/bin/python.", file=sys.stderr)
-        return 1
-
-    try:
-        pat = re.compile(args.pattern, re.IGNORECASE)
-    except re.error as e:
-        print(f"ERROR: bad regex {args.pattern!r}: {e}", file=sys.stderr)
-        return 2
-
-    try:
-        db = lancedb.connect(args.lancedb_uri)
-        tbl = db.open_table(args.table_name)
-    except Exception as e:
-        print(f"ERROR: can't open lancedb table {args.table_name!r} at " f"{args.lancedb_uri!r}: {e}", file=sys.stderr)
-        return 1
-
-    rows = tbl.to_pandas()
-    if "text" not in rows.columns:
-        print(f"ERROR: lancedb table has no 'text' column. columns={list(rows.columns)}", file=sys.stderr)
-        return 1
-
-    hits = []
-    for row in rows.itertuples(index=False):
-        text = getattr(row, "text", "") or ""
-        m = pat.search(text)
-        if not m:
-            continue
-        pdf = getattr(row, "pdf_basename", "?")
-        page = getattr(row, "page_number", "?")
-        meta_raw = getattr(row, "metadata", "") or ""
-        if isinstance(meta_raw, str):
-            try:
-                meta = json.loads(meta_raw) if meta_raw else {}
-            except json.JSONDecodeError:
-                meta = {}
-        elif isinstance(meta_raw, dict):
-            meta = meta_raw
-        else:
-            meta = {}
-        type_ = meta.get("type", "?")
-        start = max(0, m.start() - args.snippet_pad)
-        end = min(len(text), m.end() + args.snippet_pad)
-        snippet = text[start:end].replace("\n", " ")
-        hits.append((pdf, page, type_, snippet))
-
-    hits.sort(key=lambda h: (str(h[0]), int(h[1]) if isinstance(h[1], (int, float)) else 0))
-    for pdf, page, type_, snippet in hits[: args.max_hits]:
-        print(f"{pdf}:p{page}:{type_}:  ...{snippet}...")
-    if not hits:
-        print("NO_MATCH")
-    elif len(hits) > args.max_hits:
-        print(f"... ({len(hits) - args.max_hits} more matches truncated; " f"raise --max-hits to see them)")
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())