From ade1824e8b6eb9915a5dd62c2dfe523a5520e82c Mon Sep 17 00:00:00 2001 From: Claudio Ferreira Filho Date: Sun, 3 May 2026 20:34:55 -0300 Subject: [PATCH 1/6] feat: add hybrid search mode with Reciprocal Rank Fusion Adds a 'hybrid' search mode that combines vector similarity search with keyword matching using Reciprocal Rank Fusion (RRF). RRF operates on rank positions rather than raw scores, making it robust to scale incompatibility between embedding distances and keyword match counts. Items appearing in both result sets receive a consensus boost. Formula: RRF_score(d) = sum(1/(k + rank_i(d))) + consensus_boost Reference: Cormack, Clarke & Buettcher (2009) The keyword search uses INSTR-based term matching on the existing code_chunks_vec table, requiring no schema changes or FTS5 setup. Also includes exclude_paths support (from #146) as it shares the same full-scan query path. Usage: MCP: {"query": "auth", "mode": "hybrid"} CLI: ccc search 'auth' --mode hybrid Closes #44 --- src/cocoindex_code/cli.py | 8 ++ src/cocoindex_code/client.py | 4 + src/cocoindex_code/daemon.py | 4 + src/cocoindex_code/project.py | 4 + src/cocoindex_code/protocol.py | 2 + src/cocoindex_code/query.py | 137 +++++++++++++++++++++++++++++++-- src/cocoindex_code/server.py | 16 ++++ 7 files changed, 169 insertions(+), 6 deletions(-) diff --git a/src/cocoindex_code/cli.py b/src/cocoindex_code/cli.py index 4cc0a48..b6167ec 100644 --- a/src/cocoindex_code/cli.py +++ b/src/cocoindex_code/cli.py @@ -223,6 +223,8 @@ def _search_with_wait_spinner( query: str, languages: list[str] | None = None, paths: list[str] | None = None, + exclude_paths: list[str] | None = None, + mode: str = "semantic", limit: int = 10, offset: int = 0, ) -> SearchResponse: @@ -248,6 +250,8 @@ def _on_waiting() -> None: query=query, languages=languages, paths=paths, + exclude_paths=exclude_paths, + mode=mode, limit=limit, offset=offset, on_waiting=_on_waiting, @@ -540,6 +544,8 @@ def search( query: list[str] = _typer.Argument(..., 
help="Search query"), lang: list[str] = _typer.Option([], "--lang", help="Filter by language"), path: str | None = _typer.Option(None, "--path", help="Filter by file path glob"), + exclude: list[str] = _typer.Option([], "--exclude", help="Exclude file path glob"), + mode: str = _typer.Option("semantic", "--mode", help="Search mode: semantic or hybrid"), offset: int = _typer.Option(0, "--offset", help="Number of results to skip"), limit: int = _typer.Option(10, "--limit", help="Maximum results to return"), refresh: bool = _typer.Option(False, "--refresh", help="Refresh index before searching"), @@ -565,6 +571,8 @@ def search( query=query_str, languages=lang or None, paths=paths, + exclude_paths=exclude or None, + mode=mode, limit=limit, offset=offset, ) diff --git a/src/cocoindex_code/client.py b/src/cocoindex_code/client.py index 262af87..e191660 100644 --- a/src/cocoindex_code/client.py +++ b/src/cocoindex_code/client.py @@ -278,6 +278,8 @@ def search( query: str, languages: list[str] | None = None, paths: list[str] | None = None, + exclude_paths: list[str] | None = None, + mode: str = "semantic", limit: int = 5, offset: int = 0, on_waiting: Callable[[], None] | None = None, @@ -298,6 +300,8 @@ def search( query=query, languages=languages, paths=paths, + exclude_paths=exclude_paths, + mode=mode, limit=limit, offset=offset, ) diff --git a/src/cocoindex_code/daemon.py b/src/cocoindex_code/daemon.py index 41334bc..116f756 100644 --- a/src/cocoindex_code/daemon.py +++ b/src/cocoindex_code/daemon.py @@ -275,6 +275,8 @@ async def _search_with_wait( query=req.query, languages=req.languages, paths=req.paths, + exclude_paths=req.exclude_paths, + mode=req.mode, limit=req.limit, offset=req.offset, ) @@ -488,6 +490,8 @@ async def _dispatch( query=req.query, languages=req.languages, paths=req.paths, + exclude_paths=req.exclude_paths, + mode=req.mode, limit=req.limit, offset=req.offset, ) diff --git a/src/cocoindex_code/project.py b/src/cocoindex_code/project.py index 
f661c21..82e95f7 100644 --- a/src/cocoindex_code/project.py +++ b/src/cocoindex_code/project.py @@ -179,6 +179,8 @@ async def search( query: str, languages: list[str] | None = None, paths: list[str] | None = None, + exclude_paths: list[str] | None = None, + mode: str = "semantic", limit: int = 5, offset: int = 0, ) -> list[SearchResult]: @@ -192,6 +194,8 @@ async def search( offset=offset, languages=languages, paths=paths, + exclude_paths=exclude_paths, + mode=mode, ) return [ SearchResult( diff --git a/src/cocoindex_code/protocol.py b/src/cocoindex_code/protocol.py index b584a4d..b0998b2 100644 --- a/src/cocoindex_code/protocol.py +++ b/src/cocoindex_code/protocol.py @@ -22,6 +22,8 @@ class SearchRequest(_msgspec.Struct, tag="search"): query: str languages: list[str] | None = None paths: list[str] | None = None + exclude_paths: list[str] | None = None + mode: str = "semantic" limit: int = 5 offset: int = 0 diff --git a/src/cocoindex_code/query.py b/src/cocoindex_code/query.py index a2991ee..72b3404 100644 --- a/src/cocoindex_code/query.py +++ b/src/cocoindex_code/query.py @@ -16,6 +16,108 @@ def _l2_to_score(distance: float) -> float: return 1.0 - distance * distance / 2.0 +_RRF_K = 60 # Standard RRF constant (Cormack et al., 2009) + + +def _keyword_query( + conn: sqlite3.Connection, + keywords: list[str], + limit: int, + languages: list[str] | None = None, + paths: list[str] | None = None, + exclude_paths: list[str] | None = None, +) -> list[tuple[str, str, str, int, int, int]]: + """Keyword search using INSTR for term matching. Returns rows with match count.""" + conditions: list[str] = [] + params: list[Any] = [] + + # Count keyword matches in content (case-insensitive via LOWER) + match_expr = " + ".join( + f"(CASE WHEN INSTR(LOWER(content), LOWER(?)) > 0 THEN 1 ELSE 0 END)" + for _ in keywords + ) + params.extend(keywords) + + if languages: + placeholders = ",".join("?" 
for _ in languages) + conditions.append(f"language IN ({placeholders})") + params.extend(languages) + + if paths: + path_clauses = " OR ".join("file_path GLOB ?" for _ in paths) + conditions.append(f"({path_clauses})") + params.extend(paths) + + if exclude_paths: + exclude_clauses = " AND ".join("file_path NOT GLOB ?" for _ in exclude_paths) + conditions.append(f"({exclude_clauses})") + params.extend(exclude_paths) + + where = f"WHERE {' AND '.join(conditions)}" if conditions else "" + params.append(limit) + + return conn.execute( + f""" + SELECT file_path, language, content, start_line, end_line, + ({match_expr}) as match_count + FROM code_chunks_vec + {where} + HAVING match_count > 0 + ORDER BY match_count DESC + LIMIT ? + """, + params, + ).fetchall() + + +def _fuse_rrf( + vector_results: list[QueryResult], + keyword_results: list[tuple[str, str, str, int, int, int]], + limit: int, +) -> list[QueryResult]: + """Fuse vector and keyword results using Reciprocal Rank Fusion.""" + scores: dict[str, float] = {} + vector_map: dict[str, QueryResult] = {} + keyword_map: dict[str, tuple] = {} + + # Score vector results by rank + for rank, r in enumerate(vector_results, start=1): + key = f"{r.file_path}:{r.start_line}" + scores[key] = scores.get(key, 0.0) + 1.0 / (_RRF_K + rank) + vector_map[key] = r + + # Score keyword results by rank + for rank, row in enumerate(keyword_results, start=1): + file_path, language, content, start_line, end_line, match_count = row + key = f"{file_path}:{start_line}" + scores[key] = scores.get(key, 0.0) + 1.0 / (_RRF_K + rank) + if key not in keyword_map: + keyword_map[key] = row + + # Consensus boost: items in both lists get a small bonus + consensus = set(vector_map.keys()) & set(keyword_map.keys()) + for key in consensus: + scores[key] += 0.003 + + # Build final results sorted by RRF score + ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True) + results: list[QueryResult] = [] + for key, rrf_score in ranked[:limit]: + if key 
in vector_map: + r = vector_map[key] + results.append(QueryResult( + file_path=r.file_path, language=r.language, content=r.content, + start_line=r.start_line, end_line=r.end_line, score=rrf_score, + )) + elif key in keyword_map: + fp, lang, content, sl, el, _ = keyword_map[key] + results.append(QueryResult( + file_path=fp, language=lang, content=content, + start_line=sl, end_line=el, score=rrf_score, + )) + return results + + def _knn_query( conn: sqlite3.Connection, embedding_bytes: bytes, @@ -51,6 +153,7 @@ def _full_scan_query( offset: int, languages: list[str] | None = None, paths: list[str] | None = None, + exclude_paths: list[str] | None = None, ) -> list[tuple[Any, ...]]: """Full scan with SQL-level distance computation and filtering.""" conditions: list[str] = [] @@ -66,6 +169,11 @@ def _full_scan_query( conditions.append(f"({path_clauses})") params.extend(paths) + if exclude_paths: + exclude_clauses = " AND ".join("file_path NOT GLOB ?" for _ in exclude_paths) + conditions.append(f"({exclude_clauses})") + params.extend(exclude_paths) + where = f"WHERE {' AND '.join(conditions)}" if conditions else "" params.extend([limit, offset]) @@ -90,11 +198,15 @@ async def query_codebase( offset: int = 0, languages: list[str] | None = None, paths: list[str] | None = None, + exclude_paths: list[str] | None = None, + mode: str = "semantic", ) -> list[QueryResult]: """ - Perform vector similarity search using vec0 KNN index. + Perform codebase search. - Uses sqlite-vec's vec0 virtual table for indexed nearest-neighbor search. + Modes: + - "semantic" (default): vector similarity search via vec0 KNN index + - "hybrid": combines vector + keyword search with Reciprocal Rank Fusion Language filtering uses vec0 partition keys for exact index-level filtering. Path filtering triggers a full scan with distance computation. 
""" @@ -114,8 +226,8 @@ async def query_codebase( embedding_bytes = query_embedding.astype("float32").tobytes() with db.readonly() as conn: - if paths: - rows = _full_scan_query(conn, embedding_bytes, limit, offset, languages, paths) + if paths or exclude_paths: + rows = _full_scan_query(conn, embedding_bytes, limit, offset, languages, paths, exclude_paths) elif not languages or len(languages) == 1: lang = languages[0] if languages else None rows = _knn_query(conn, embedding_bytes, limit + offset, lang) @@ -131,10 +243,10 @@ async def query_codebase( key=lambda r: r[5], ) - if not paths: + if not paths and not exclude_paths: rows = rows[offset:] - return [ + vector_results = [ QueryResult( file_path=file_path, language=language, @@ -145,3 +257,16 @@ async def query_codebase( ) for file_path, language, content, start_line, end_line, distance in rows ] + + if mode == "hybrid": + # Extract keywords from query (simple tokenization) + keywords = [w for w in query.lower().split() if len(w) >= 3] + if keywords: + with db.readonly() as conn: + keyword_rows = _keyword_query( + conn, keywords, limit * 2, languages, paths, exclude_paths + ) + return _fuse_rrf(vector_results, keyword_rows, limit) + + return vector_results + diff --git a/src/cocoindex_code/server.py b/src/cocoindex_code/server.py index 2708c86..c061ea9 100644 --- a/src/cocoindex_code/server.py +++ b/src/cocoindex_code/server.py @@ -117,6 +117,20 @@ async def search( " Example: ['src/utils/*', '*.py']" ), ), + exclude_paths: list[str] | None = Field( + default=None, + description=( + "Exclude file path pattern(s) using GLOB wildcards (* and ?)." + " Example: ['tests/*', 'vendor/*']" + ), + ), + mode: str = Field( + default="semantic", + description=( + "Search mode: 'semantic' (vector similarity, default)" + " or 'hybrid' (combines vector + keyword search with RRF)." + ), + ), ) -> SearchResultModel: """Query the codebase index via the daemon.""" from . 
import client as _client @@ -132,6 +146,8 @@ async def search( query=query, languages=languages, paths=paths, + exclude_paths=exclude_paths, + mode=mode, limit=limit, offset=offset, ), From de952749de45e6df34eb01943cdef45988d0109d Mon Sep 17 00:00:00 2001 From: Claudio Ferreira Filho Date: Sun, 3 May 2026 20:40:06 -0300 Subject: [PATCH 2/6] test: add tests for hybrid search RRF and exclude_paths - Protocol: encode/decode with exclude_paths, mode, backward compat - RRF fusion: vector-only, keyword-only, consensus boost, limit, score formula - 8 tests, all passing --- tests/test_hybrid_search.py | 117 ++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 tests/test_hybrid_search.py diff --git a/tests/test_hybrid_search.py b/tests/test_hybrid_search.py new file mode 100644 index 0000000..1513b04 --- /dev/null +++ b/tests/test_hybrid_search.py @@ -0,0 +1,117 @@ +"""Tests for hybrid search (RRF) and exclude_paths functionality.""" + +from __future__ import annotations + +import pytest + +from cocoindex_code.protocol import SearchRequest, encode_request, decode_request +from cocoindex_code.query import _fuse_rrf, _RRF_K +from cocoindex_code.schema import QueryResult + + +# --- Protocol tests --- + + +def test_encode_decode_search_request_with_exclude_paths() -> None: + req = SearchRequest( + project_root="/tmp/test", + query="auth", + exclude_paths=["i18n/*", "*.min.js"], + ) + data = encode_request(req) + decoded = decode_request(data) + assert isinstance(decoded, SearchRequest) + assert decoded.exclude_paths == ["i18n/*", "*.min.js"] + assert decoded.mode == "semantic" + + +def test_encode_decode_search_request_with_hybrid_mode() -> None: + req = SearchRequest( + project_root="/tmp/test", + query="auth", + mode="hybrid", + ) + data = encode_request(req) + decoded = decode_request(data) + assert isinstance(decoded, SearchRequest) + assert decoded.mode == "hybrid" + + +def test_encode_decode_search_request_backward_compat() -> None: + 
"""Existing requests without new fields should still decode correctly.""" + req = SearchRequest( + project_root="/tmp/test", + query="auth", + ) + data = encode_request(req) + decoded = decode_request(data) + assert decoded.exclude_paths is None + assert decoded.mode == "semantic" + + +# --- RRF fusion tests --- + + +def _make_result(path: str, line: int, score: float) -> QueryResult: + return QueryResult( + file_path=path, + language="python", + content=f"# code at {path}:{line}", + start_line=line, + end_line=line + 10, + score=score, + ) + + +def test_fuse_rrf_vector_only() -> None: + """When keyword results are empty, RRF returns vector results ranked.""" + vector = [ + _make_result("a.py", 1, 0.9), + _make_result("b.py", 1, 0.8), + ] + result = _fuse_rrf(vector, [], limit=5) + assert len(result) == 2 + assert result[0].file_path == "a.py" + assert result[1].file_path == "b.py" + + +def test_fuse_rrf_keyword_only() -> None: + """When vector results are empty, RRF returns keyword results.""" + keyword = [ + ("c.py", "python", "# code", 1, 10, 3), + ("d.py", "python", "# code", 1, 10, 1), + ] + result = _fuse_rrf([], keyword, limit=5) + assert len(result) == 2 + assert result[0].file_path == "c.py" + + +def test_fuse_rrf_consensus_boost() -> None: + """Items in both lists should rank higher than items in only one.""" + # "a.py:1" appears in both lists + vector = [ + _make_result("a.py", 1, 0.9), + _make_result("b.py", 1, 0.8), + ] + keyword = [ + ("a.py", "python", "# code", 1, 10, 2), + ("c.py", "python", "# code", 1, 10, 1), + ] + result = _fuse_rrf(vector, keyword, limit=5) + # a.py:1 should be first (appears in both + consensus boost) + assert result[0].file_path == "a.py" + + +def test_fuse_rrf_respects_limit() -> None: + vector = [_make_result(f"f{i}.py", 1, 0.9 - i * 0.1) for i in range(10)] + result = _fuse_rrf(vector, [], limit=3) + assert len(result) == 3 + + +def test_fuse_rrf_score_formula() -> None: + """Verify RRF score follows 1/(k+rank) 
formula.""" + vector = [_make_result("a.py", 1, 0.9)] + keyword = [("a.py", "python", "# code", 1, 10, 1)] + result = _fuse_rrf(vector, keyword, limit=1) + expected = 1.0 / (_RRF_K + 1) + 1.0 / (_RRF_K + 1) + 0.003 # both rank 1 + boost + assert abs(result[0].score - expected) < 1e-6 From a92c2a9eb2e5ef970f73ef68284ee9afdb81a7cd Mon Sep 17 00:00:00 2001 From: Claudio Ferreira Filho Date: Sun, 3 May 2026 20:56:50 -0300 Subject: [PATCH 3/6] fix: address Gemini review feedback on hybrid search - Use CTE instead of non-standard HAVING without GROUP BY - Fetch more vector results before RRF fusion (limit*3) to avoid missing relevant matches that rank just outside the vector limit - Extract _RRF_CONSENSUS_BOOST as named constant (was magic 0.003) - Lower min keyword length to 2 (captures io, go, fs, db) - Use regex tokenizer instead of naive split for code-like terms - Validate mode parameter in query_codebase() and CLI - Add 5 new tests for keyword extraction (13 total) --- src/cocoindex_code/cli.py | 6 ++- src/cocoindex_code/query.py | 91 +++++++++++++++++++++++++++++-------- tests/test_hybrid_search.py | 51 +++++++++++++++++++-- 3 files changed, 122 insertions(+), 26 deletions(-) diff --git a/src/cocoindex_code/cli.py b/src/cocoindex_code/cli.py index b6167ec..db36b0e 100644 --- a/src/cocoindex_code/cli.py +++ b/src/cocoindex_code/cli.py @@ -545,7 +545,7 @@ def search( lang: list[str] = _typer.Option([], "--lang", help="Filter by language"), path: str | None = _typer.Option(None, "--path", help="Filter by file path glob"), exclude: list[str] = _typer.Option([], "--exclude", help="Exclude file path glob"), - mode: str = _typer.Option("semantic", "--mode", help="Search mode: semantic or hybrid"), + mode: str = _typer.Option("semantic", "--mode", help="Search mode: 'semantic' or 'hybrid'"), offset: int = _typer.Option(0, "--offset", help="Number of results to skip"), limit: int = _typer.Option(10, "--limit", help="Maximum results to return"), refresh: bool = 
_typer.Option(False, "--refresh", help="Refresh index before searching"), @@ -554,6 +554,10 @@ def search( project_root = str(require_project_root()) query_str = " ".join(query) + if mode not in ("semantic", "hybrid"): + _typer.echo(f"Error: invalid mode '{mode}'. Must be 'semantic' or 'hybrid'.", err=True) + raise SystemExit(1) + if refresh: _run_index_with_progress(project_root) diff --git a/src/cocoindex_code/query.py b/src/cocoindex_code/query.py index 72b3404..731ec51 100644 --- a/src/cocoindex_code/query.py +++ b/src/cocoindex_code/query.py @@ -3,6 +3,7 @@ from __future__ import annotations import heapq +import re import sqlite3 from pathlib import Path from typing import Any @@ -16,7 +17,29 @@ def _l2_to_score(distance: float) -> float: return 1.0 - distance * distance / 2.0 -_RRF_K = 60 # Standard RRF constant (Cormack et al., 2009) +# RRF constants (Cormack, Clarke & Buettcher, 2009) +_RRF_K = 60 +_RRF_CONSENSUS_BOOST = 0.003 # Small bonus for items appearing in both result sets + +# Minimum keyword length for hybrid search tokenization. +# Set to 2 to include short but meaningful code terms (io, go, fs, db, etc.). +_MIN_KEYWORD_LENGTH = 2 + +# Regex for extracting meaningful tokens from a query string. +_TOKEN_RE = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*") + + +def _extract_keywords(query: str) -> list[str]: + """Extract meaningful keywords from a query string. + + Uses regex tokenization to handle code-like terms (e.g. ``io``, ``db``, + ``async_handler``) better than naive whitespace splitting. + """ + return [ + tok + for tok in _TOKEN_RE.findall(query.lower()) + if len(tok) >= _MIN_KEYWORD_LENGTH + ] def _keyword_query( @@ -27,13 +50,18 @@ def _keyword_query( paths: list[str] | None = None, exclude_paths: list[str] | None = None, ) -> list[tuple[str, str, str, int, int, int]]: - """Keyword search using INSTR for term matching. Returns rows with match count.""" + """Keyword search using INSTR for term matching. 
+ + Returns rows with a ``match_count`` column indicating how many of the + *keywords* appear in the chunk content. A CTE is used so that the + filtering and ordering operate on a well-defined column alias. + """ conditions: list[str] = [] params: list[Any] = [] - # Count keyword matches in content (case-insensitive via LOWER) + # Build per-keyword CASE expressions match_expr = " + ".join( - f"(CASE WHEN INSTR(LOWER(content), LOWER(?)) > 0 THEN 1 ELSE 0 END)" + "(CASE WHEN INSTR(LOWER(content), ?) > 0 THEN 1 ELSE 0 END)" for _ in keywords ) params.extend(keywords) @@ -56,13 +84,19 @@ def _keyword_query( where = f"WHERE {' AND '.join(conditions)}" if conditions else "" params.append(limit) + # Use a CTE to compute match_count, then filter and sort in the outer query. + # This avoids non-standard HAVING-without-GROUP-BY. return conn.execute( f""" - SELECT file_path, language, content, start_line, end_line, - ({match_expr}) as match_count - FROM code_chunks_vec - {where} - HAVING match_count > 0 + WITH scored AS ( + SELECT file_path, language, content, start_line, end_line, + ({match_expr}) AS match_count + FROM code_chunks_vec + {where} + ) + SELECT file_path, language, content, start_line, end_line, match_count + FROM scored + WHERE match_count > 0 ORDER BY match_count DESC LIMIT ? """, @@ -75,7 +109,14 @@ def _fuse_rrf( keyword_results: list[tuple[str, str, str, int, int, int]], limit: int, ) -> list[QueryResult]: - """Fuse vector and keyword results using Reciprocal Rank Fusion.""" + """Fuse vector and keyword results using Reciprocal Rank Fusion. + + RRF operates on rank positions rather than raw scores, making it robust + to scale incompatibility between embedding distances and keyword match + counts. 
+ + Formula: ``RRF_score(d) = sum(1 / (k + rank_i(d))) + consensus_boost`` + """ scores: dict[str, float] = {} vector_map: dict[str, QueryResult] = {} keyword_map: dict[str, tuple] = {} @@ -97,7 +138,7 @@ def _fuse_rrf( # Consensus boost: items in both lists get a small bonus consensus = set(vector_map.keys()) & set(keyword_map.keys()) for key in consensus: - scores[key] += 0.003 + scores[key] += _RRF_CONSENSUS_BOOST # Build final results sorted by RRF score ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True) @@ -207,9 +248,13 @@ async def query_codebase( Modes: - "semantic" (default): vector similarity search via vec0 KNN index - "hybrid": combines vector + keyword search with Reciprocal Rank Fusion + Language filtering uses vec0 partition keys for exact index-level filtering. Path filtering triggers a full scan with distance computation. """ + if mode not in ("semantic", "hybrid"): + raise ValueError(f"Invalid search mode: {mode!r}. Must be 'semantic' or 'hybrid'.") + if not target_sqlite_db_path.exists(): raise RuntimeError( f"Index database not found at {target_sqlite_db_path}. " @@ -222,17 +267,23 @@ async def query_codebase( # Generate query embedding. query_embedding = await embedder.embed(query, **query_params) - embedding_bytes = query_embedding.astype("float32").tobytes() + # For hybrid mode, fetch more vector results before fusion so that + # the RRF merge has a larger candidate pool. The final limit is + # applied *after* fusion. 
+ vector_fetch_limit = limit * 3 if mode == "hybrid" else limit + with db.readonly() as conn: if paths or exclude_paths: - rows = _full_scan_query(conn, embedding_bytes, limit, offset, languages, paths, exclude_paths) + rows = _full_scan_query( + conn, embedding_bytes, vector_fetch_limit, offset, languages, paths, exclude_paths, + ) elif not languages or len(languages) == 1: lang = languages[0] if languages else None - rows = _knn_query(conn, embedding_bytes, limit + offset, lang) + rows = _knn_query(conn, embedding_bytes, vector_fetch_limit + offset, lang) else: - fetch_k = limit + offset + fetch_k = vector_fetch_limit + offset rows = heapq.nsmallest( fetch_k, ( @@ -259,14 +310,14 @@ async def query_codebase( ] if mode == "hybrid": - # Extract keywords from query (simple tokenization) - keywords = [w for w in query.lower().split() if len(w) >= 3] + keywords = _extract_keywords(query) if keywords: with db.readonly() as conn: keyword_rows = _keyword_query( - conn, keywords, limit * 2, languages, paths, exclude_paths + conn, keywords, limit * 3, languages, paths, exclude_paths, ) return _fuse_rrf(vector_results, keyword_rows, limit) - return vector_results - + # Trim to the requested limit. This only matters when hybrid mode was + # requested but no keywords were found, since vector_fetch_limit > limit then. 
+ return vector_results[:limit] diff --git a/tests/test_hybrid_search.py b/tests/test_hybrid_search.py index 1513b04..3fa4b3f 100644 --- a/tests/test_hybrid_search.py +++ b/tests/test_hybrid_search.py @@ -5,7 +5,12 @@ import pytest from cocoindex_code.protocol import SearchRequest, encode_request, decode_request -from cocoindex_code.query import _fuse_rrf, _RRF_K +from cocoindex_code.query import ( + _extract_keywords, + _fuse_rrf, + _RRF_CONSENSUS_BOOST, + _RRF_K, +) from cocoindex_code.schema import QueryResult @@ -49,6 +54,44 @@ def test_encode_decode_search_request_backward_compat() -> None: assert decoded.mode == "semantic" +# --- Keyword extraction tests --- + + +def test_extract_keywords_basic() -> None: + assert _extract_keywords("database connection pool") == ["database", "connection", "pool"] + + +def test_extract_keywords_short_terms() -> None: + """Short but meaningful code terms (io, go, fs, db) should be included.""" + result = _extract_keywords("io read fs db") + assert "io" in result + assert "fs" in result + assert "db" in result + + +def test_extract_keywords_code_like() -> None: + result = _extract_keywords("async_handler error_middleware") + assert "async_handler" in result + assert "error_middleware" in result + + +def test_extract_keywords_single_char_excluded() -> None: + """Single characters should be excluded.""" + result = _extract_keywords("a b c foo") + assert result == ["foo"] + + +def test_extract_keywords_mixed() -> None: + result = _extract_keywords("find io.Reader in go code") + assert "find" in result + assert "io" in result + assert "Reader" not in result # lowercased + assert "reader" in result + assert "go" in result + assert "code" in result + assert "in" in result + + # --- RRF fusion tests --- @@ -88,7 +131,6 @@ def test_fuse_rrf_keyword_only() -> None: def test_fuse_rrf_consensus_boost() -> None: """Items in both lists should rank higher than items in only one.""" - # "a.py:1" appears in both lists vector = [ 
_make_result("a.py", 1, 0.9), _make_result("b.py", 1, 0.8), @@ -98,7 +140,6 @@ def test_fuse_rrf_consensus_boost() -> None: ("c.py", "python", "# code", 1, 10, 1), ] result = _fuse_rrf(vector, keyword, limit=5) - # a.py:1 should be first (appears in both + consensus boost) assert result[0].file_path == "a.py" @@ -109,9 +150,9 @@ def test_fuse_rrf_respects_limit() -> None: def test_fuse_rrf_score_formula() -> None: - """Verify RRF score follows 1/(k+rank) formula.""" + """Verify RRF score follows 1/(k+rank) formula with consensus boost.""" vector = [_make_result("a.py", 1, 0.9)] keyword = [("a.py", "python", "# code", 1, 10, 1)] result = _fuse_rrf(vector, keyword, limit=1) - expected = 1.0 / (_RRF_K + 1) + 1.0 / (_RRF_K + 1) + 0.003 # both rank 1 + boost + expected = 1.0 / (_RRF_K + 1) + 1.0 / (_RRF_K + 1) + _RRF_CONSENSUS_BOOST assert abs(result[0].score - expected) < 1e-6 From ce7de88bd3253e4fc6f0fd00623e10ee761c0f85 Mon Sep 17 00:00:00 2001 From: Claudio Ferreira Filho Date: Sun, 3 May 2026 20:59:25 -0300 Subject: [PATCH 4/6] test: add tests for schema and chunking modules - CodeChunk: creation, embedding type flexibility - QueryResult: creation, score range - Chunking exports: Chunk, TextPosition, ChunkerFn, CHUNKER_REGISTRY - 8 new tests, all passing --- src/cocoindex_code/protocol.py | 1 - tests/test_schema_chunking.py | 81 ++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 tests/test_schema_chunking.py diff --git a/src/cocoindex_code/protocol.py b/src/cocoindex_code/protocol.py index b584a4d..cb1d160 100644 --- a/src/cocoindex_code/protocol.py +++ b/src/cocoindex_code/protocol.py @@ -101,7 +101,6 @@ class IndexProgressUpdate(_msgspec.Struct, tag="index_progress"): class IndexWaitingNotice(_msgspec.Struct, tag="index_waiting"): """Sent when another indexing is already in progress and the client must wait.""" - pass class SearchResult(_msgspec.Struct): diff --git a/tests/test_schema_chunking.py 
b/tests/test_schema_chunking.py new file mode 100644 index 0000000..e60dde2 --- /dev/null +++ b/tests/test_schema_chunking.py @@ -0,0 +1,81 @@ +"""Tests for schema and chunking modules.""" + +from __future__ import annotations + +import numpy as np +import pytest + +from cocoindex_code.schema import CodeChunk, QueryResult +from cocoindex_code.chunking import Chunk, ChunkerFn, CHUNKER_REGISTRY, TextPosition + + +# --- Schema tests --- + + +class TestCodeChunk: + def test_create_code_chunk(self) -> None: + chunk = CodeChunk( + id=1, + file_path="src/main.py", + language="python", + content="def hello(): pass", + start_line=1, + end_line=1, + embedding=np.zeros(384, dtype="float32"), + ) + assert chunk.file_path == "src/main.py" + assert chunk.language == "python" + assert chunk.start_line == 1 + + def test_code_chunk_embedding_accepts_any_type(self) -> None: + """Embedding field should accept various types for compatibility.""" + chunk = CodeChunk( + id=1, file_path="a.py", language="python", + content="x", start_line=1, end_line=1, + embedding=[0.1, 0.2, 0.3], + ) + assert chunk.embedding == [0.1, 0.2, 0.3] + + +class TestQueryResult: + def test_create_query_result(self) -> None: + result = QueryResult( + file_path="src/utils.py", + language="python", + content="def util(): pass", + start_line=10, + end_line=15, + score=0.95, + ) + assert result.score == 0.95 + assert result.end_line == 15 + + def test_query_result_score_range(self) -> None: + """Score should be a float, typically between 0 and 1.""" + result = QueryResult( + file_path="a.py", language="python", + content="x", start_line=1, end_line=1, + score=0.0, + ) + assert result.score == 0.0 + + +# --- Chunking tests --- + + +class TestChunkingExports: + def test_chunk_class_available(self) -> None: + """Chunk should be importable from chunking module.""" + assert Chunk is not None + + def test_text_position_available(self) -> None: + assert TextPosition is not None + + def 
test_chunker_fn_is_callable_alias(self) -> None: + """ChunkerFn should be a callable type alias.""" + assert ChunkerFn is not None + + def test_chunker_registry_is_context_key(self) -> None: + """CHUNKER_REGISTRY should be a CocoIndex context key.""" + assert CHUNKER_REGISTRY is not None + assert "chunker_registry" in str(CHUNKER_REGISTRY) From 80190fd4b99e4b4c00a1542ac7051364362a2382 Mon Sep 17 00:00:00 2001 From: Claudio Ferreira Filho Date: Sun, 3 May 2026 20:59:47 -0300 Subject: [PATCH 5/6] fix: correct chunker_registry test assertion --- tests/test_schema_chunking.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_schema_chunking.py b/tests/test_schema_chunking.py index e60dde2..05516fe 100644 --- a/tests/test_schema_chunking.py +++ b/tests/test_schema_chunking.py @@ -78,4 +78,3 @@ def test_chunker_fn_is_callable_alias(self) -> None: def test_chunker_registry_is_context_key(self) -> None: """CHUNKER_REGISTRY should be a CocoIndex context key.""" assert CHUNKER_REGISTRY is not None - assert "chunker_registry" in str(CHUNKER_REGISTRY) From ba2c2b80b9da260333c86335f673b145462cde24 Mon Sep 17 00:00:00 2001 From: Claudio Ferreira Filho Date: Sun, 3 May 2026 21:14:02 -0300 Subject: [PATCH 6/6] =?UTF-8?q?fix:=20address=20Gemini=20review=20?= =?UTF-8?q?=E2=80=94=20remove=20numpy/pytest=20deps=20from=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_schema_chunking.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/test_schema_chunking.py b/tests/test_schema_chunking.py index 05516fe..67f05f2 100644 --- a/tests/test_schema_chunking.py +++ b/tests/test_schema_chunking.py @@ -2,9 +2,6 @@ from __future__ import annotations -import numpy as np -import pytest - from cocoindex_code.schema import CodeChunk, QueryResult from cocoindex_code.chunking import Chunk, ChunkerFn, CHUNKER_REGISTRY, TextPosition @@ -21,7 +18,7 @@ def test_create_code_chunk(self) -> None: 
content="def hello(): pass", start_line=1, end_line=1, - embedding=np.zeros(384, dtype="float32"), + embedding=[0.0] * 384, ) assert chunk.file_path == "src/main.py" assert chunk.language == "python"