Skip to content
12 changes: 12 additions & 0 deletions src/cocoindex_code/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,8 @@ def _search_with_wait_spinner(
query: str,
languages: list[str] | None = None,
paths: list[str] | None = None,
exclude_paths: list[str] | None = None,
mode: str = "semantic",
limit: int = 10,
offset: int = 0,
) -> SearchResponse:
Expand All @@ -248,6 +250,8 @@ def _on_waiting() -> None:
query=query,
languages=languages,
paths=paths,
exclude_paths=exclude_paths,
mode=mode,
limit=limit,
offset=offset,
on_waiting=_on_waiting,
Expand Down Expand Up @@ -540,6 +544,8 @@ def search(
query: list[str] = _typer.Argument(..., help="Search query"),
lang: list[str] = _typer.Option([], "--lang", help="Filter by language"),
path: str | None = _typer.Option(None, "--path", help="Filter by file path glob"),
exclude: list[str] = _typer.Option([], "--exclude", help="Exclude file path glob"),
mode: str = _typer.Option("semantic", "--mode", help="Search mode: 'semantic' or 'hybrid'"),
offset: int = _typer.Option(0, "--offset", help="Number of results to skip"),
limit: int = _typer.Option(10, "--limit", help="Maximum results to return"),
refresh: bool = _typer.Option(False, "--refresh", help="Refresh index before searching"),
Expand All @@ -548,6 +554,10 @@ def search(
project_root = str(require_project_root())
query_str = " ".join(query)

if mode not in ("semantic", "hybrid"):
_typer.echo(f"Error: invalid mode '{mode}'. Must be 'semantic' or 'hybrid'.", err=True)
raise SystemExit(1)

if refresh:
_run_index_with_progress(project_root)

Expand All @@ -565,6 +575,8 @@ def search(
query=query_str,
languages=lang or None,
paths=paths,
exclude_paths=exclude or None,
mode=mode,
limit=limit,
offset=offset,
)
Expand Down
4 changes: 4 additions & 0 deletions src/cocoindex_code/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,8 @@ def search(
query: str,
languages: list[str] | None = None,
paths: list[str] | None = None,
exclude_paths: list[str] | None = None,
mode: str = "semantic",
limit: int = 5,
offset: int = 0,
on_waiting: Callable[[], None] | None = None,
Expand All @@ -298,6 +300,8 @@ def search(
query=query,
languages=languages,
paths=paths,
exclude_paths=exclude_paths,
mode=mode,
limit=limit,
offset=offset,
)
Expand Down
4 changes: 4 additions & 0 deletions src/cocoindex_code/daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,8 @@ async def _search_with_wait(
query=req.query,
languages=req.languages,
paths=req.paths,
exclude_paths=req.exclude_paths,
mode=req.mode,
limit=req.limit,
offset=req.offset,
)
Expand Down Expand Up @@ -488,6 +490,8 @@ async def _dispatch(
query=req.query,
languages=req.languages,
paths=req.paths,
exclude_paths=req.exclude_paths,
mode=req.mode,
limit=req.limit,
offset=req.offset,
)
Expand Down
4 changes: 4 additions & 0 deletions src/cocoindex_code/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,8 @@ async def search(
query: str,
languages: list[str] | None = None,
paths: list[str] | None = None,
exclude_paths: list[str] | None = None,
mode: str = "semantic",
limit: int = 5,
offset: int = 0,
) -> list[SearchResult]:
Expand All @@ -192,6 +194,8 @@ async def search(
offset=offset,
languages=languages,
paths=paths,
exclude_paths=exclude_paths,
mode=mode,
)
return [
SearchResult(
Expand Down
3 changes: 2 additions & 1 deletion src/cocoindex_code/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ class SearchRequest(_msgspec.Struct, tag="search"):
query: str
languages: list[str] | None = None
paths: list[str] | None = None
exclude_paths: list[str] | None = None
mode: str = "semantic"
limit: int = 5
offset: int = 0

Expand Down Expand Up @@ -101,7 +103,6 @@ class IndexProgressUpdate(_msgspec.Struct, tag="index_progress"):
class IndexWaitingNotice(_msgspec.Struct, tag="index_waiting"):
    """Sent when another indexing is already in progress and the client must wait.

    The message carries no fields; the ``index_waiting`` tag alone identifies it.
    """


class SearchResult(_msgspec.Struct):
Expand Down
194 changes: 185 additions & 9 deletions src/cocoindex_code/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import heapq
import re
import sqlite3
from pathlib import Path
from typing import Any
Expand All @@ -16,6 +17,148 @@ def _l2_to_score(distance: float) -> float:
return 1.0 - distance * distance / 2.0


# Reciprocal Rank Fusion (RRF) constants (Cormack, Clarke & Buettcher, 2009).
# _RRF_K is the smoothing constant ``k`` in the per-list contribution
# ``1 / (k + rank)`` that each ranked result adds to its fused score.
_RRF_K = 60
# Small flat bonus added once to the fused score of any item that appears in
# both the vector and the keyword result sets.
_RRF_CONSENSUS_BOOST = 0.003

# Minimum keyword length for hybrid search tokenization.
# Set to 2 to include short but meaningful code terms (io, go, fs, db, etc.).
_MIN_KEYWORD_LENGTH = 2

# Regex for extracting identifier-like tokens from a query string: an ASCII
# letter or underscore followed by any number of ASCII letters, digits or
# underscores.
_TOKEN_RE = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*")


def _extract_keywords(query: str) -> list[str]:
    """Extract the distinct, meaningful keywords from a query string.

    The query is lower-cased and tokenized with ``_TOKEN_RE`` so that
    code-like terms (e.g. ``io``, ``db``, ``async_handler``) survive intact,
    which naive whitespace splitting would not guarantee.  Tokens shorter
    than ``_MIN_KEYWORD_LENGTH`` are dropped.

    Args:
        query: Free-form search text.

    Returns:
        Lower-cased keywords in first-seen order, each listed once.  The
        list is empty when the query contains no usable token.
    """
    tokens = (
        tok
        for tok in _TOKEN_RE.findall(query.lower())
        if len(tok) >= _MIN_KEYWORD_LENGTH
    )
    # A repeated word must count as one keyword: every entry returned here
    # becomes its own term in the keyword match count, so duplicates would
    # score the same hit several times.  dict.fromkeys keeps first-seen order.
    return list(dict.fromkeys(tokens))


def _keyword_query(
    conn: sqlite3.Connection,
    keywords: list[str],
    limit: int,
    languages: list[str] | None = None,
    paths: list[str] | None = None,
    exclude_paths: list[str] | None = None,
) -> list[tuple[str, str, str, int, int, int]]:
    """Keyword search over ``code_chunks_vec`` using INSTR for term matching.

    Each distinct, non-empty keyword contributes 1 to a row's ``match_count``
    when it occurs as a substring of ``LOWER(content)``.  Keywords are
    compared as given, so callers should pass them lower-cased.

    Args:
        conn: Open SQLite connection exposing the ``code_chunks_vec`` table.
        keywords: Terms to look for; duplicates and empty strings are ignored.
        limit: Maximum number of rows to return.
        languages: If given, keep only rows whose ``language`` is listed.
        paths: If given, keep only rows whose ``file_path`` matches at least
            one of these GLOB patterns.
        exclude_paths: If given, drop rows whose ``file_path`` matches any of
            these GLOB patterns.

    Returns:
        Rows ``(file_path, language, content, start_line, end_line,
        match_count)`` with ``match_count > 0``, ordered by ``match_count``
        descending and then by ``file_path`` and ``start_line`` so that ties
        come back in a reproducible order.  Empty when no usable keyword is
        supplied.
    """
    # Count each term once and skip empty strings, so match_count really is
    # "number of distinct keywords found".
    terms = list(dict.fromkeys(kw for kw in keywords if kw))
    if not terms:
        # Without terms the match expression would render as "()", which is
        # not valid SQL; there is nothing to match anyway.
        return []

    # One CASE expression per term; their sum is the match count.
    match_expr = " + ".join(
        "(CASE WHEN INSTR(LOWER(content), ?) > 0 THEN 1 ELSE 0 END)"
        for _ in terms
    )
    # Parameter order must follow placeholder order in the statement:
    # match terms (SELECT list), then filters (WHERE), then LIMIT.
    params: list[Any] = list(terms)
    conditions: list[str] = []

    if languages:
        placeholders = ",".join("?" for _ in languages)
        conditions.append(f"language IN ({placeholders})")
        params.extend(languages)

    if paths:
        path_clauses = " OR ".join("file_path GLOB ?" for _ in paths)
        conditions.append(f"({path_clauses})")
        params.extend(paths)

    if exclude_paths:
        exclude_clauses = " AND ".join("file_path NOT GLOB ?" for _ in exclude_paths)
        conditions.append(f"({exclude_clauses})")
        params.extend(exclude_paths)

    where = f"WHERE {' AND '.join(conditions)}" if conditions else ""
    params.append(limit)

    # Use a CTE to compute match_count, then filter and sort in the outer query.
    # This avoids non-standard HAVING-without-GROUP-BY.  The secondary sort
    # keys make the ordering of equal match counts deterministic, which matters
    # because callers turn row positions into ranks.
    return conn.execute(
        f"""
        WITH scored AS (
            SELECT file_path, language, content, start_line, end_line,
                   ({match_expr}) AS match_count
            FROM code_chunks_vec
            {where}
        )
        SELECT file_path, language, content, start_line, end_line, match_count
        FROM scored
        WHERE match_count > 0
        ORDER BY match_count DESC, file_path, start_line
        LIMIT ?
        """,
        params,
    ).fetchall()


def _fuse_rrf(
    vector_results: list[QueryResult],
    keyword_results: list[tuple[str, str, str, int, int, int]],
    limit: int,
) -> list[QueryResult]:
    """Merge vector and keyword hits with Reciprocal Rank Fusion (RRF).

    A hit is identified by ``"<file_path>:<start_line>"``.  It earns
    ``1 / (_RRF_K + rank)`` for every appearance in either list, where
    ``rank`` is its 1-based position in that list, plus
    ``_RRF_CONSENSUS_BOOST`` once when it is present in both lists.  Only
    positions are used, never the raw distances or match counts, so the two
    score scales do not need to be comparable.

    Args:
        vector_results: Vector-search hits in rank order.
        keyword_results: Keyword rows ``(file_path, language, content,
            start_line, end_line, match_count)`` in rank order.
        limit: Maximum number of fused results to return.

    Returns:
        At most *limit* ``QueryResult`` objects in descending fused-score
        order, each carrying the fused score in ``score``.  When a hit is
        present in both lists its fields are taken from the vector result.
    """
    fused: dict[str, float] = {}
    from_vector: dict[str, QueryResult] = {}
    from_keyword: dict[str, tuple] = {}

    # Rank contribution of the vector list; a later duplicate replaces the
    # stored hit but still adds to the score.
    for position, hit in enumerate(vector_results, start=1):
        ident = f"{hit.file_path}:{hit.start_line}"
        fused[ident] = fused.get(ident, 0.0) + 1.0 / (_RRF_K + position)
        from_vector[ident] = hit

    # Rank contribution of the keyword list; the first row seen for an
    # identifier is the one kept for building the result.
    for position, record in enumerate(keyword_results, start=1):
        path, _lang, _text, first_line, _last_line, _hits = record
        ident = f"{path}:{first_line}"
        fused[ident] = fused.get(ident, 0.0) + 1.0 / (_RRF_K + position)
        from_keyword.setdefault(ident, record)

    # Hits found by both searches get the consensus bonus exactly once.
    for ident in from_vector.keys() & from_keyword.keys():
        fused[ident] += _RRF_CONSENSUS_BOOST

    # Stable sort: equal scores keep the order in which they were first seen.
    ordering = sorted(fused, key=fused.__getitem__, reverse=True)

    merged: list[QueryResult] = []
    for ident in ordering[:limit]:
        if ident in from_vector:
            hit = from_vector[ident]
            path, lang, text = hit.file_path, hit.language, hit.content
            first_line, last_line = hit.start_line, hit.end_line
        else:
            path, lang, text, first_line, last_line, _hits = from_keyword[ident]
        merged.append(
            QueryResult(
                file_path=path,
                language=lang,
                content=text,
                start_line=first_line,
                end_line=last_line,
                score=fused[ident],
            )
        )
    return merged


def _knn_query(
conn: sqlite3.Connection,
embedding_bytes: bytes,
Expand Down Expand Up @@ -51,6 +194,7 @@ def _full_scan_query(
offset: int,
languages: list[str] | None = None,
paths: list[str] | None = None,
exclude_paths: list[str] | None = None,
) -> list[tuple[Any, ...]]:
"""Full scan with SQL-level distance computation and filtering."""
conditions: list[str] = []
Expand All @@ -66,6 +210,11 @@ def _full_scan_query(
conditions.append(f"({path_clauses})")
params.extend(paths)

if exclude_paths:
exclude_clauses = " AND ".join("file_path NOT GLOB ?" for _ in exclude_paths)
conditions.append(f"({exclude_clauses})")
params.extend(exclude_paths)

where = f"WHERE {' AND '.join(conditions)}" if conditions else ""
params.extend([limit, offset])

Expand All @@ -90,14 +239,22 @@ async def query_codebase(
offset: int = 0,
languages: list[str] | None = None,
paths: list[str] | None = None,
exclude_paths: list[str] | None = None,
mode: str = "semantic",
) -> list[QueryResult]:
"""
Perform vector similarity search using vec0 KNN index.
Perform codebase search.

Modes:
- "semantic" (default): vector similarity search via vec0 KNN index
- "hybrid": combines vector + keyword search with Reciprocal Rank Fusion

Uses sqlite-vec's vec0 virtual table for indexed nearest-neighbor search.
Language filtering uses vec0 partition keys for exact index-level filtering.
Path filtering triggers a full scan with distance computation.
"""
if mode not in ("semantic", "hybrid"):
raise ValueError(f"Invalid search mode: {mode!r}. Must be 'semantic' or 'hybrid'.")

if not target_sqlite_db_path.exists():
raise RuntimeError(
f"Index database not found at {target_sqlite_db_path}. "
Expand All @@ -110,17 +267,23 @@ async def query_codebase(

# Generate query embedding.
query_embedding = await embedder.embed(query, **query_params)

embedding_bytes = query_embedding.astype("float32").tobytes()

# For hybrid mode, fetch more vector results before fusion so that
# the RRF merge has a larger candidate pool. The final limit is
# applied *after* fusion.
vector_fetch_limit = limit * 3 if mode == "hybrid" else limit

with db.readonly() as conn:
if paths:
rows = _full_scan_query(conn, embedding_bytes, limit, offset, languages, paths)
if paths or exclude_paths:
rows = _full_scan_query(
conn, embedding_bytes, vector_fetch_limit, offset, languages, paths, exclude_paths,
)
elif not languages or len(languages) == 1:
lang = languages[0] if languages else None
rows = _knn_query(conn, embedding_bytes, limit + offset, lang)
rows = _knn_query(conn, embedding_bytes, vector_fetch_limit + offset, lang)
else:
fetch_k = limit + offset
fetch_k = vector_fetch_limit + offset
rows = heapq.nsmallest(
fetch_k,
(
Expand All @@ -131,10 +294,10 @@ async def query_codebase(
key=lambda r: r[5],
)

if not paths:
if not paths and not exclude_paths:
rows = rows[offset:]

return [
vector_results = [
QueryResult(
file_path=file_path,
language=language,
Expand All @@ -145,3 +308,16 @@ async def query_codebase(
)
for file_path, language, content, start_line, end_line, distance in rows
]

if mode == "hybrid":
keywords = _extract_keywords(query)
if keywords:
with db.readonly() as conn:
keyword_rows = _keyword_query(
conn, keywords, limit * 3, languages, paths, exclude_paths,
)
return _fuse_rrf(vector_results, keyword_rows, limit)

# For semantic mode, trim to requested limit (vector_fetch_limit may
# have been larger when hybrid was requested but no keywords found).
return vector_results[:limit]
Loading