cocoindex/examples/multi_codebase_summarization/main.py at main · cocoindex-io/cocoindex · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
"""
Multi-Codebase Summarization - CocoIndex Pipeline Example

This example demonstrates a CocoIndex pipeline that:
1. Scans subdirectories of a root directory (each expected to be a Python project)
2. For each project, extracts:
   - Public classes/functions with functionality summaries
   - Mermaid graphs for CocoIndex app/function call relationships
   - File-level summaries
3. Aggregates per-file extractions into a project summary
4. Outputs markdown documentation to output/PROJECT_NAME.md
"""

from __future__ import annotations

import os
import pathlib
from typing import Collection

import instructor
from litellm import acompletion

import cocoindex as coco
from cocoindex.connectors import localfs
from cocoindex.resources.file import FileLike, PatternFilePathMatcher

from models import CodebaseInfo


# Model identifier handed to every LLM call in this module; the LLM_MODEL
# environment variable overrides the built-in default.
LLM_MODEL = os.getenv("LLM_MODEL", "gemini/gemini-2.5-flash")

# Shared instructor client built on litellm's async completion function,
# configured for JSON mode.
_instructor_client = instructor.from_litellm(
    acompletion,
    mode=instructor.Mode.JSON,
)


@coco.fn(memo=True)
async def extract_file_info(file: FileLike) -> CodebaseInfo:
    """Ask the LLM for a structured description of one Python source file.

    The file's text and its path are embedded in an analysis prompt, and the
    model is asked to answer in the shape of ``CodebaseInfo``.

    Args:
        file: The source file to analyze; its text is read asynchronously.

    Returns:
        The ``CodebaseInfo`` extracted for this file.
    """
    source_text = await file.read_text()
    location = str(file.file_path.path)

    prompt = f"""Analyze the following Python file and extract structured information.

File path: {location}

```python
{source_text}
```

Instructions:
1. Identify all PUBLIC classes (not starting with _) and summarize their purpose
2. Identify all PUBLIC functions (not starting with _) and summarize their purpose
3. If this file contains CocoIndex apps (coco.App), create Mermaid graphs showing the
   function call relationships (see the mermaid_graphs field description for format)
4. Provide a brief summary of the file's purpose
"""

    messages = [{"role": "user", "content": prompt}]
    extraction = await _instructor_client.chat.completions.create(
        model=LLM_MODEL,
        response_model=CodebaseInfo,
        messages=messages,
    )

    # Re-validate from a plain dict dump so the caller gets a freshly built
    # CodebaseInfo (presumably to shed anything instructor attaches to the
    # response object -- confirm).
    return CodebaseInfo.model_validate(extraction.model_dump())


@coco.fn
async def aggregate_project_info(
    project_name: str,
    file_infos: list[CodebaseInfo],
) -> CodebaseInfo:
    """Aggregate multiple file extractions into a project-level summary.

    Args:
        project_name: Name of the project; always used as the ``name`` of the
            returned ``CodebaseInfo``.
        file_infos: Per-file extraction results belonging to the project.

    Returns:
        A ``CodebaseInfo`` named ``project_name``:

        * no files -- a placeholder with a fixed "empty project" summary;
        * one file -- that file's summary, classes, functions and graphs;
        * several files -- an LLM-generated aggregate, falling back to the
          per-file Mermaid graphs when the LLM produced none.
    """
    if not file_infos:
        return CodebaseInfo(
            name=project_name, summary="Empty project with no Python files."
        )

    # Single file - nothing to aggregate, just relabel it with the project name
    if len(file_infos) == 1:
        info = file_infos[0]
        return CodebaseInfo(
            name=project_name,
            summary=info.summary,
            public_classes=info.public_classes,
            public_functions=info.public_functions,
            mermaid_graphs=info.mermaid_graphs,
        )

    # Multiple files - use LLM to create aggregated summary

    # Format file summaries for the prompt (names only; per-item summaries
    # are left out of the aggregation prompt)
    files_text = "\n\n".join(
        f"### {info.name}\n"
        f"Summary: {info.summary}\n"
        f"Classes: {', '.join(c.name for c in info.public_classes) or 'None'}\n"
        f"Functions: {', '.join(f.name for f in info.public_functions) or 'None'}"
        for info in file_infos
    )

    # Collect all mermaid graphs from files, used as a fallback below
    all_graphs = [g for info in file_infos for g in info.mermaid_graphs]

    prompt = f"""Aggregate the following Python files into a project-level summary.

Project name: {project_name}

Files:
{files_text}

Create a unified CodebaseInfo that:
1. Summarizes the overall project purpose (not individual files)
2. Lists the most important public classes across all files
3. Lists the most important public functions across all files
4. For mermaid_graphs: create a single unified graph showing how the CocoIndex
   components connect across the project (if applicable)
"""

    result = await _instructor_client.chat.completions.create(
        model=LLM_MODEL,
        response_model=CodebaseInfo,
        messages=[{"role": "user", "content": prompt}],
    )
    result = CodebaseInfo.model_validate(result.model_dump())

    # The LLM chooses its own `name`; pin it to the project name so this
    # branch agrees with the empty and single-file branches above.
    result.name = project_name

    # Keep original file-level graphs if LLM didn't generate a unified one
    if not result.mermaid_graphs and all_graphs:
        result.mermaid_graphs = all_graphs

    return result


def _fence_mermaid_graph(graph: str) -> str:
    """Return *graph* as a complete fenced code block, or "" when it is blank."""
    body = graph.strip()
    if not body:
        return ""
    if not body.startswith("```"):
        # Bare diagram source: wrap it in a mermaid fence.
        return f"```mermaid\n{body}\n```"

    # The LLM supplied its own fence. Accept it when the text also ends with a
    # backtick run at least as long as the opening one; otherwise (e.g. a
    # truncated response) close it so the fence cannot swallow the rest of the
    # generated document.
    opening = len(body) - len(body.lstrip("`"))
    closing = len(body) - len(body.rstrip("`"))
    if len(body) > opening and closing >= opening:
        return body
    return f"{body}\n{'`' * opening}"


@coco.fn
def generate_markdown(
    project_name: str, info: CodebaseInfo, file_infos: list[CodebaseInfo]
) -> str:
    """Generate markdown documentation from project info.

    Args:
        project_name: Used as the top-level heading of the document.
        info: Project-level information (summary, classes, functions, graphs).
        file_infos: Per-file information; a "File Details" section is emitted
            only when there is more than one entry.

    Returns:
        The markdown document as a single newline-joined string.
    """
    lines = [
        f"# {project_name}",
        "",
        "## Overview",
        "",
        info.summary,
        "",
    ]

    # Components
    if info.public_classes or info.public_functions:
        lines.extend(["## Components", ""])

        if info.public_classes:
            lines.append("**Classes:**")
            for cls in info.public_classes:
                lines.append(f"- `{cls.name}`: {cls.summary}")
            lines.append("")

        if info.public_functions:
            lines.append("**Functions:**")
            for fn in info.public_functions:
                # Star marks functions flagged as CocoIndex functions (see legend)
                marker = " ★" if fn.is_coco_function else ""
                lines.append(f"- `{fn.signature}`{marker}: {fn.summary}")
            lines.append("")

    # Mermaid graphs: normalise fences and drop blank graphs so the section
    # never contains empty or unterminated code blocks.
    graph_blocks = [
        block for block in map(_fence_mermaid_graph, info.mermaid_graphs) if block
    ]
    if graph_blocks:
        lines.extend(["## CocoIndex Pipeline", ""])
        for block in graph_blocks:
            lines.extend([block, ""])

    # File details (if multiple files)
    if len(file_infos) > 1:
        lines.extend(["## File Details", ""])
        for fi in file_infos:
            lines.extend([f"### {fi.name}", "", fi.summary, ""])

    # Legend
    lines.extend(["---", "", "*★ = CocoIndex function*"])

    return "\n".join(lines)


@coco.fn(memo=True)
async def process_project(
    project_name: str,
    files: Collection[localfs.File],
    output_dir: pathlib.Path,
) -> None:
    """Summarize one project and declare its markdown report.

    Runs per-file extraction over ``files``, folds the results into a single
    project summary, renders that as markdown, and declares the result as
    ``<output_dir>/<project_name>.md``.
    """
    # Step 1: one extraction per source file.
    per_file = await coco.map(extract_file_info, files)

    # Step 2: fold the per-file results into one project-level record.
    summary = await aggregate_project_info(project_name, per_file)

    # Step 3: render the report and declare it as an output file.
    report = generate_markdown(project_name, summary, per_file)
    report_path = output_dir / f"{project_name}.md"
    localfs.declare_file(report_path, report, create_parent_dirs=True)


@coco.fn
async def app_main(
    root_dir: pathlib.Path,
    output_dir: pathlib.Path,
) -> None:
    """
    Application entry point.

    Every non-hidden subdirectory of root_dir is treated as one Python
    project. Each project that contains at least one matching .py file is
    mounted as its own component, which writes that project's markdown
    documentation under output_dir.
    """
    for candidate in root_dir.resolve().iterdir():
        # Only visible directories count as projects.
        is_visible_dir = candidate.is_dir() and not candidate.name.startswith(".")
        if not is_visible_dir:
            continue

        # Collect *.py files recursively; the exclusion patterns target
        # dot-prefixed (hidden) entries and __pycache__.
        matcher = PatternFilePathMatcher(
            included_patterns=["**/*.py"],
            excluded_patterns=["**/.*", "**/__pycache__"],
        )
        python_files = []
        async for source_file in localfs.walk_dir(
            candidate,
            recursive=True,
            path_matcher=matcher,
        ):
            python_files.append(source_file)

        # Projects without any Python files get no component and no report.
        if not python_files:
            continue

        await coco.mount(
            coco.component_subpath("project", candidate.name),
            process_project,
            candidate.name,
            python_files,
            output_dir,
        )


# Module-level CocoIndex app wiring `app_main` to its two path arguments.
# Both paths are relative: `app_main` calls `root_dir.resolve()`, so "../" is
# resolved against the current working directory at run time, and every
# visible subdirectory of that parent directory is treated as a project.
# NOTE(review): the keyword arguments are presumably forwarded to `app_main`
# by `coco.App` -- confirm against the CocoIndex API.
app = coco.App(
    coco.AppConfig(name="MultiCodebaseSummarization"),
    app_main,
    root_dir=pathlib.Path("../"),
    # Reports are declared as <output_dir>/<project_name>.md by process_project.
    output_dir=pathlib.Path("./output"),
)