-
Notifications
You must be signed in to change notification settings - Fork 820
Expand file tree
/
Copy pathmain.py
More file actions
256 lines (207 loc) · 7.93 KB
/
Copy pathmain.py
File metadata and controls
256 lines (207 loc) · 7.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
"""
Multi-Codebase Summarization - CocoIndex Pipeline Example
This example demonstrates a CocoIndex pipeline that:
1. Scans subdirectories of a root directory (each expected to be a Python project)
2. For each project, extracts:
- Public classes/functions with functionality summaries
- Mermaid graphs for CocoIndex app/function call relationships
- File-level summaries
3. Aggregates per-file extractions into a project summary
4. Outputs markdown documentation to output/PROJECT_NAME.md
"""
from __future__ import annotations
import os
import pathlib
from typing import Collection
import instructor
from litellm import acompletion
import cocoindex as coco
from cocoindex.connectors import localfs
from cocoindex.resources.file import FileLike, PatternFilePathMatcher
from models import CodebaseInfo
# Model identifier passed as `model=` to every LLM call in this module.
# Override it with the LLM_MODEL environment variable; otherwise the
# Gemini 2.5 Flash default below is used.
LLM_MODEL = os.environ.get("LLM_MODEL", "gemini/gemini-2.5-flash")
# Shared instructor client wrapping litellm's async `acompletion`, configured
# in JSON mode. The functions below call it with `response_model=CodebaseInfo`
# to get structured responses.
_instructor_client = instructor.from_litellm(acompletion, mode=instructor.Mode.JSON)
@coco.fn(memo=True)
async def extract_file_info(file: FileLike) -> CodebaseInfo:
    """Ask the LLM for a structured description of a single Python file.

    The file's text and path are embedded in an analysis prompt, and the
    response is requested in the shape of ``CodebaseInfo``.

    Args:
        file: File handle; its text (``read_text()``) and ``file_path.path``
            are both placed in the prompt.

    Returns:
        A ``CodebaseInfo`` validated from the dumped LLM response.
    """
    source_text = await file.read_text()
    path_str = str(file.file_path.path)

    # The prompt text is kept flush-left on purpose: any indentation inside
    # the triple-quoted literal would become part of what is sent to the model.
    prompt = f"""Analyze the following Python file and extract structured information.
File path: {path_str}
```python
{source_text}
```
Instructions:
1. Identify all PUBLIC classes (not starting with _) and summarize their purpose
2. Identify all PUBLIC functions (not starting with _) and summarize their purpose
3. If this file contains CocoIndex apps (coco.App), create Mermaid graphs showing the
function call relationships (see the mermaid_graphs field description for format)
4. Provide a brief summary of the file's purpose
"""

    user_message = {"role": "user", "content": prompt}
    llm_reply = await _instructor_client.chat.completions.create(
        model=LLM_MODEL,
        response_model=CodebaseInfo,
        messages=[user_message],
    )

    # Round-trip through dump/validate so the returned value is built by
    # CodebaseInfo itself (presumably to normalise whatever object instructor
    # hands back -- confirm against the instructor docs).
    return CodebaseInfo.model_validate(llm_reply.model_dump())
@coco.fn
async def aggregate_project_info(
    project_name: str,
    file_infos: list[CodebaseInfo],
) -> CodebaseInfo:
    """Combine per-file extractions into one project-level ``CodebaseInfo``.

    Three cases are handled:

    * no files: a placeholder summary is returned without calling the LLM;
    * exactly one file: that file's data is copied under the project name,
      again without calling the LLM;
    * several files: the LLM is asked to produce a unified summary from the
      per-file names, summaries, and class/function names.

    Args:
        project_name: Name given to the resulting ``CodebaseInfo`` and
            mentioned in the aggregation prompt.
        file_infos: Per-file extraction results to merge.

    Returns:
        The project-level ``CodebaseInfo``.
    """
    if not file_infos:
        return CodebaseInfo(
            name=project_name, summary="Empty project with no Python files."
        )

    # A lone file needs no aggregation; only the name changes.
    if len(file_infos) == 1:
        only = file_infos[0]
        return CodebaseInfo(
            name=project_name,
            summary=only.summary,
            public_classes=only.public_classes,
            public_functions=only.public_functions,
            mermaid_graphs=only.mermaid_graphs,
        )

    # Several files: build one prompt section per file and, in the same pass,
    # gather every file-level Mermaid graph as a fallback.
    sections = []
    fallback_graphs = []
    for entry in file_infos:
        class_names = ", ".join(cls.name for cls in entry.public_classes) or "None"
        function_names = ", ".join(fn.name for fn in entry.public_functions) or "None"
        sections.append(
            f"### {entry.name}\n"
            f"Summary: {entry.summary}\n"
            f"Classes: {class_names}\n"
            f"Functions: {function_names}"
        )
        fallback_graphs.extend(entry.mermaid_graphs)
    files_text = "\n\n".join(sections)

    # Prompt text is flush-left so no indentation leaks into the request.
    prompt = f"""Aggregate the following Python files into a project-level summary.
Project name: {project_name}
Files:
{files_text}
Create a unified CodebaseInfo that:
1. Summarizes the overall project purpose (not individual files)
2. Lists the most important public classes across all files
3. Lists the most important public functions across all files
4. For mermaid_graphs: create a single unified graph showing how the CocoIndex
components connect across the project (if applicable)
"""

    llm_reply = await _instructor_client.chat.completions.create(
        model=LLM_MODEL,
        response_model=CodebaseInfo,
        messages=[{"role": "user", "content": prompt}],
    )
    project_info = CodebaseInfo.model_validate(llm_reply.model_dump())

    # If the model produced no unified graph, fall back to the file-level ones.
    if fallback_graphs and not project_info.mermaid_graphs:
        project_info.mermaid_graphs = fallback_graphs
    return project_info
def _as_mermaid_block(graph: str) -> str | None:
    """Return *graph* as a closed, fenced Markdown code block.

    Returns ``None`` when the graph is blank after stripping, so the caller
    can skip it instead of emitting an empty diagram.
    """
    body = graph.strip()
    if not body:
        return None

    if not body.startswith("```"):
        # Bare diagram source: wrap it in mermaid code fences.
        return "\n".join(["```mermaid", body, "```"])

    # The LLM already supplied an opening fence. Make sure it is closed too:
    # an unterminated fence would swallow every section that follows it.
    opening_fence = body[: len(body) - len(body.lstrip("`"))]
    graph_lines = body.splitlines()
    last_line = graph_lines[-1].strip()
    is_closed = (
        len(graph_lines) > 1
        and len(last_line) >= len(opening_fence)
        and set(last_line) == {"`"}
    )
    if is_closed:
        return body
    return f"{body}\n{opening_fence}"


@coco.fn
def generate_markdown(
    project_name: str, info: CodebaseInfo, file_infos: list[CodebaseInfo]
) -> str:
    """Generate markdown documentation from project info.

    The document contains, in order: a title and overview, an optional
    "Components" section (classes and functions), an optional
    "CocoIndex Pipeline" section with Mermaid graphs, an optional
    "File Details" section (only when more than one file was analysed),
    and a legend explaining the ★ marker.

    Args:
        project_name: Used as the top-level heading.
        info: Project-level data; ``summary``, ``public_classes``,
            ``public_functions`` and ``mermaid_graphs`` are read.
        file_infos: Per-file data; each entry's ``name`` and ``summary`` are
            listed under "File Details" when there is more than one entry.

    Returns:
        The markdown document as a single newline-joined string.
    """
    lines = [
        f"# {project_name}",
        "",
        "## Overview",
        "",
        info.summary,
        "",
    ]

    # Components
    if info.public_classes or info.public_functions:
        lines.extend(["## Components", ""])

        if info.public_classes:
            lines.append("**Classes:**")
            lines.extend(
                f"- `{cls.name}`: {cls.summary}" for cls in info.public_classes
            )
            lines.append("")

        if info.public_functions:
            lines.append("**Functions:**")
            for fn in info.public_functions:
                marker = " ★" if fn.is_coco_function else ""
                lines.append(f"- `{fn.signature}`{marker}: {fn.summary}")
            lines.append("")

    # Mermaid graphs: normalise fences first (the LLM may or may not include
    # them) and drop blank graphs, so the section heading is only emitted
    # when there is at least one diagram to show.
    graph_blocks = [
        block
        for block in map(_as_mermaid_block, info.mermaid_graphs)
        if block is not None
    ]
    if graph_blocks:
        lines.extend(["## CocoIndex Pipeline", ""])
        for block in graph_blocks:
            lines.extend([block, ""])

    # File details (if multiple files)
    if len(file_infos) > 1:
        lines.extend(["## File Details", ""])
        for fi in file_infos:
            lines.extend([f"### {fi.name}", "", fi.summary, ""])

    # Legend
    lines.extend(["---", "", "*★ = CocoIndex function*"])
    return "\n".join(lines)
@coco.fn(memo=True)
async def process_project(
    project_name: str,
    files: Collection[localfs.File],
    output_dir: pathlib.Path,
) -> None:
    """Run the full pipeline for one project and declare its markdown output.

    Steps: extract a ``CodebaseInfo`` from every file, aggregate them into a
    project-level summary, render markdown, and declare the result as
    ``<output_dir>/<project_name>.md``.

    Args:
        project_name: Project name; also the stem of the output file.
        files: Files belonging to the project.
        output_dir: Directory under which the markdown file is declared.
    """
    # Per-file extraction, mapped over all files by CocoIndex.
    per_file_infos = await coco.map(extract_file_info, files)

    # Project-level roll-up of the per-file results.
    project_summary = await aggregate_project_info(project_name, per_file_infos)

    # Render the document and declare it as an output file.
    document = generate_markdown(project_name, project_summary, per_file_infos)
    target_path = output_dir / f"{project_name}.md"
    localfs.declare_file(target_path, document, create_parent_dirs=True)
@coco.fn
async def app_main(
    root_dir: pathlib.Path,
    output_dir: pathlib.Path,
) -> None:
    """Entry point: mount one processing component per project directory.

    Every non-hidden subdirectory of ``root_dir`` is treated as a project.
    Its ``*.py`` files are collected (dot-prefixed paths and ``__pycache__``
    are excluded), and projects with at least one such file are handed to
    ``process_project``.

    Args:
        root_dir: Directory whose immediate subdirectories are the projects.
        output_dir: Directory passed on to ``process_project`` for output.
    """
    for candidate in root_dir.resolve().iterdir():
        # Only visible directories count as projects.
        is_project = candidate.is_dir() and not candidate.name.startswith(".")
        if not is_project:
            continue
        project_name = candidate.name

        # Collect this project's Python files, skipping hidden paths
        # (e.g. .venv) and __pycache__.
        py_matcher = PatternFilePathMatcher(
            included_patterns=["**/*.py"],
            excluded_patterns=["**/.*", "**/__pycache__"],
        )
        py_files = []
        async for found in localfs.walk_dir(
            candidate,
            recursive=True,
            path_matcher=py_matcher,
        ):
            py_files.append(found)

        # Nothing to document for a project without Python files.
        if not py_files:
            continue

        # Mount a component, keyed by project name, to process this project.
        await coco.mount(
            coco.component_subpath("project", project_name),
            process_project,
            project_name,
            py_files,
            output_dir,
        )
# Application object wiring `app_main` to its arguments.
# Both paths are relative: `app_main` calls `root_dir.resolve()`, so "../" is
# resolved against the process's current working directory, i.e. the parent
# directory is scanned for project subdirectories. The generated
# `<project>.md` files are declared under "./output".
app = coco.App(
coco.AppConfig(name="MultiCodebaseSummarization"),
app_main,
root_dir=pathlib.Path("../"),
output_dir=pathlib.Path("./output"),
)