Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
b057902
refactor(agents): migrate docqa + routing to hub (#1102)
Jun 4, 2026
dc71d62
fix(agents): repoint gaia-agent-code at relocated RoutingAgent (#1102)
Jun 4, 2026
42d3220
Merge remote-tracking branch 'origin/main' into claudia/task-8fa7ecef
Jun 10, 2026
7b7379f
Merge remote-tracking branch 'origin/main' into claudia/task-8fa7ecef
Jun 10, 2026
1674258
Merge branch 'main' into claudia/task-8fa7ecef
kovtcharov-amd Jun 10, 2026
3b81417
Merge branch 'main' into claudia/task-8fa7ecef
kovtcharov-amd Jun 11, 2026
7fc433d
Merge remote-tracking branch 'origin/main' into claudia/task-8fa7ecef
kovtcharov Jun 17, 2026
633e365
fix(agents): complete docqa+routing migration cleanup (#1102)
kovtcharov Jun 17, 2026
f5e3586
ci(code-agent): install local routing wheel before code agent (#1102)
kovtcharov Jun 17, 2026
81b07b8
ci(api): install routing+code wheels so gaia-code API path works (#1102)
kovtcharov Jun 17, 2026
28a643a
Merge remote-tracking branch 'origin/main' into pr1455-update
kovtcharov Jun 19, 2026
8fa14a5
docs(agents): fix stale docqa/routing paths flagged in review (#1102)
kovtcharov Jun 19, 2026
e8f5f45
Merge remote-tracking branch 'origin/main' into pr1455-update
kovtcharov Jun 19, 2026
17221cc
Merge remote-tracking branch 'origin/main' into pr1455-update
kovtcharov Jun 19, 2026
d952a0b
feat(rag): index Microsoft Word (.docx) documents
kovtcharov Jun 25, 2026
5dc899a
fix(rag): harden .docx extraction against retrieval-poisoning edge cases
kovtcharov Jun 25, 2026
86855ff
Merge remote-tracking branch 'origin/main' into feat/rag-docx-indexing
kovtcharov Jun 25, 2026
fa01b9d
docs(rag): move .docx TODO out of docstring body (review nit)
kovtcharov Jun 25, 2026
0a62aa2
Merge branch 'main' into feat/rag-docx-indexing
kovtcharov Jun 25, 2026
8a11a0d
Merge branch 'main' into feat/rag-docx-indexing
Jun 29, 2026
fa94ee8
style(tests): drop stray blank line from merge resolution
Jun 29, 2026
25cbb4d
docs(chat): show .docx/.xlsx in RAG --index examples
Jun 29, 2026
8cd9209
fix(setup): drop duplicate agent-docqa/agent-routing extras keys
Jun 30, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion .github/workflows/test_gaia_cli.yml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,20 @@ jobs:
uses: ./.github/workflows/test_browser_agent.yml
if: github.event_name != 'pull_request' || github.event.pull_request.draft == false || contains(github.event.pull_request.labels.*.name, 'ready_for_ci')

# Test DocQA Agent (standalone hub wheel, #1102)
test-docqa-agent:
name: DocQA Agent Tests
needs: lint
uses: ./.github/workflows/test_docqa_agent.yml
if: github.event_name != 'pull_request' || github.event.pull_request.draft == false || contains(github.event.pull_request.labels.*.name, 'ready_for_ci')

# Test Routing Agent (standalone hub wheel, #1102)
test-routing-agent:
name: Routing Agent Tests
needs: lint
uses: ./.github/workflows/test_routing_agent.yml
if: github.event_name != 'pull_request' || github.event.pull_request.draft == false || contains(github.event.pull_request.labels.*.name, 'ready_for_ci')

# Test Email Agent (standalone hub wheel, #1102)
test-email-agent:
name: Email Agent Tests
Expand Down Expand Up @@ -126,7 +140,7 @@ jobs:
test-summary:
name: Test Summary
runs-on: ubuntu-latest
needs: [lint, unit-tests, test-windows, test-linux, test-mcp, test-code-agent, test-chat-agent, test-connectors-demo, test-analyst-agent, test-browser-agent, test-email-agent, test-docqa-agent, test-routing-agent, test-security]
needs: [lint, unit-tests, test-windows, test-linux, test-mcp, test-code-agent, test-chat-agent, test-connectors-demo, test-analyst-agent, test-browser-agent, test-docqa-agent, test-routing-agent, test-email-agent, test-security]
# Run always except when workflow or any dependency is cancelled (e.g., by cancel-in-progress)
if: >-
${{ always() && !cancelled() &&
Expand Down
12 changes: 7 additions & 5 deletions docs/guides/chat.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -102,24 +102,26 @@ gaia chat --query "Hello" --show-stats
## Document Q&A (RAG)

<Note>
RAG (Retrieval-Augmented Generation) enables chatting with PDF and PowerPoint (.pptx) documents using semantic search and context retrieval.
RAG (Retrieval-Augmented Generation) enables chatting with documents — including PDF, Word (.docx), PowerPoint (.pptx), and Excel (.xlsx) — using semantic search and context retrieval.
</Note>

### CLI with RAG

<Tabs>
<Tab title="Single Document">
```bash
# Chat with a PDF or PowerPoint document
# Chat with a PDF, Word, PowerPoint, or Excel document
gaia chat --index manual.pdf
gaia chat --index handbook.docx
gaia chat --index slides.pptx
gaia chat --index budget.xlsx
```
</Tab>

<Tab title="Multiple Documents">
```bash
# Chat with multiple documents (PDF and PPTX supported)
gaia chat --index doc1.pdf doc2.pdf slides.pptx
# Chat with multiple documents (PDF, DOCX, PPTX, XLSX supported)
gaia chat --index doc1.pdf report.docx slides.pptx
```
</Tab>

Expand All @@ -132,7 +134,7 @@ RAG (Retrieval-Augmented Generation) enables chatting with PDF and PowerPoint (.

<Tab title="Watch Folder">
```bash
# Auto-index every PDF/PPTX in a folder, and any new ones dropped in later
# Auto-index every supported document in a folder, and any new ones dropped in later
gaia chat --watch ./docs
```
</Tab>
Expand Down
9 changes: 8 additions & 1 deletion docs/sdk/sdks/rag.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,13 @@ RAG searches your documents and answers using real information.
- **Augmented** → Add that information to the LLM's context
- **Generation** → LLM generates an answer using your documents

<Note>
**Supported document formats:** PDF, Word (`.docx`), PowerPoint (`.pptx`),
Excel (`.xlsx`), plus plain text, Markdown, CSV, JSON, HTML, and common source
code files. Legacy binary Office formats (`.doc`, `.ppt`, `.xls`) are not
supported — re-save them as the modern `.docx` / `.pptx` / `.xlsx` formats.
</Note>

---

## How RAG Works: A Mental Model
Expand All @@ -62,7 +69,7 @@ This happens once when you index documents:
```mermaid
%%{init: {'theme':'base', 'themeVariables': { 'primaryColor':'#E2A33E', 'primaryTextColor':'#1a1a1a', 'primaryBorderColor':'#A87B2D', 'lineColor':'#EFC480', 'secondaryColor':'#2d2d2d', 'tertiaryColor':'#f5f5f5', 'fontFamily': 'system-ui, -apple-system, sans-serif'}}}%%
flowchart TD
A(["PDF / PPTX Document"]) --> B(["EXTRACT TEXT"])
A(["PDF / DOCX / PPTX / XLSX Document"]) --> B(["EXTRACT TEXT"])
B --> C(["SPLIT INTO CHUNKS"])
C --> D(["GENERATE EMBEDDINGS"])
D --> E[("STORE IN FAISS")]
Expand Down
6 changes: 4 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@
"pymupdf>=1.24.0",
"pypdf",
"python-pptx>=0.6.21",
"python-docx>=1.1.0",
"sentence-transformers",
"safetensors",
# torch is pinned lower-bound only. The "audio" extra caps
Expand Down Expand Up @@ -232,6 +233,7 @@
"pymupdf>=1.24.0",
"pypdf",
"python-pptx>=0.6.21",
"python-docx>=1.1.0",
"sentence-transformers",
],
"lint": [
Expand Down Expand Up @@ -266,9 +268,9 @@
"agent-connectors-demo": ["gaia-agent-connectors-demo"],
"agent-analyst": ["gaia-agent-analyst"],
"agent-browser": ["gaia-agent-browser"],
"agent-email": ["gaia-agent-email"],
"agent-docqa": ["gaia-agent-docqa"],
"agent-routing": ["gaia-agent-routing"],
"agent-email": ["gaia-agent-email"],
"agent-chat": ["gaia-agent-chat"],
"agents": [
"gaia-agent-summarize",
Expand All @@ -282,9 +284,9 @@
"gaia-agent-connectors-demo",
"gaia-agent-analyst",
"gaia-agent-browser",
"gaia-agent-email",
"gaia-agent-docqa",
"gaia-agent-routing",
"gaia-agent-email",
"gaia-agent-chat",
],
},
Expand Down
19 changes: 10 additions & 9 deletions src/gaia/apps/webui/src/components/UnsupportedFeature.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -84,17 +84,18 @@ const UNSUPPORTED_FILE_CATEGORIES: FileTypeCategory[] = [
},
{
label: 'Microsoft Office',
extensions: new Set(['.doc', '.docx', '.ppt', '.xls']),
extensions: new Set(['.doc', '.ppt', '.xls']),
message:
'Word, legacy PowerPoint (.ppt), and legacy Excel (.xls) files are ' +
'not yet supported — GAIA does not currently ship parsers for these ' +
'formats. Modern PowerPoint (.pptx) is supported.',
'Legacy Office formats (.doc, legacy PowerPoint .ppt, legacy Excel ' +
'.xls) are not supported — GAIA reads only the modern XML-based ' +
'formats. Modern Word (.docx), PowerPoint (.pptx), and Excel (.xlsx) ' +
'are supported.',
alternatives: [
'Re-save as .docx — GAIA indexes modern Word documents directly',
'Save modern PowerPoint as .pptx — GAIA indexes .pptx directly',
'Save Word as PDF, then index the PDF',
'Re-save legacy .xls workbooks as .xlsx — GAIA supports modern Excel files',
],
featureTitle: 'Support Microsoft Office (doc, docx, ppt, xls) indexing',
featureTitle: 'Support legacy Microsoft Office (doc, ppt, xls) indexing',
},
];

Expand All @@ -117,11 +118,11 @@ export function getUnsupportedCategory(extension: string): FileTypeCategory | nu
* ``src/gaia/ui/utils.py``. Only list extensions that have a real extractor
* in ``src/gaia/rag/sdk.py::_extract_text_from_file`` — listing one without
* a backend handler causes the RAG pipeline to index binary garbage.
* .pptx IS supported (python-pptx ships with GAIA); .doc/.docx/.ppt and
* legacy .xls are intentionally excluded.
* .pptx and .docx ARE supported (python-pptx / python-docx ship with GAIA);
* legacy .doc/.ppt/.xls are intentionally excluded.
*/
export const SUPPORTED_EXTENSIONS = new Set([
'.pdf', '.pptx', '.txt', '.md', '.csv', '.json', '.xlsx',
'.pdf', '.pptx', '.docx', '.txt', '.md', '.csv', '.json', '.xlsx',
'.html', '.htm', '.xml', '.svg',
'.yaml', '.yml', '.py', '.js', '.ts', '.java', '.c', '.cpp',
'.h', '.rs', '.go', '.rb', '.sh', '.bat', '.ps1', '.log',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ describe('getUnsupportedCategory', () => {
expect(getUnsupportedCategory('.pdf')).toBeNull();
});

it('returns "Microsoft Office" for .docx', () => {
expect(getUnsupportedCategory('.docx')?.label).toBe('Microsoft Office');
it('returns null for .docx — Word indexing is supported', () => {
expect(getUnsupportedCategory('.docx')).toBeNull();
});

it('returns null for .pptx — PowerPoint indexing is supported', () => {
Expand All @@ -60,6 +60,10 @@ describe('isExtensionSupported', () => {
expect(isExtensionSupported('.pptx')).toBe(true);
});

it('returns true for .docx — modern Word is indexable', () => {
expect(isExtensionSupported('.docx')).toBe(true);
});

it('returns false for .exe', () => {
expect(isExtensionSupported('.exe')).toBe(false);
});
Expand Down
186 changes: 186 additions & 0 deletions src/gaia/rag/sdk.py
Original file line number Diff line number Diff line change
Expand Up @@ -1601,6 +1601,188 @@ def _extract_text_from_xlsx(self, xlsx_path: str) -> str:
self.log.error(f"Error reading Excel file {xlsx_path}: {e}")
raise

def _extract_text_from_docx(self, docx_path: str) -> str:
    """Extract text from a Word (.docx) document using python-docx.

    Walks the document body in order so paragraphs and tables stay
    interleaved. Paragraph text is collected by walking the inline tree,
    so runs nested in hyperlinks, content controls (``w:sdt``), and
    textboxes are captured — not just the direct runs of the paragraph.
    Tabs and line/page breaks become whitespace so adjacent words don't
    glue together, and paragraphs nested inside a paragraph (the content
    of a textbox) are placed on their own lines for the same reason. The
    ``mc:Fallback`` twin of a DrawingML textbox is skipped so shape text
    isn't emitted twice. Table cells (including tables nested in a cell,
    and rows/cells wrapped in content controls) and block-level content
    controls are recursed into; each table row is emitted as one line
    with cells joined by ``" | "``.

    Known omissions: header/footer text (separate XML parts, usually
    repeated boilerplate) and embedded images (TODO #1072: VLM extraction
    for images embedded in .docx files).

    Args:
        docx_path: Path to the .docx file to read.

    Returns:
        The extracted text as a single string, one paragraph or table
        row per line.

    Raises:
        ImportError: If python-docx is not installed.
        ValueError: If the file is not a readable ZIP container, the
            path cannot be opened, the declared uncompressed size
            exceeds 500 MB, or python-docx cannot open the document.
        Exception: Any error raised while walking the document is logged
            and re-raised unchanged.
    """
    file_name = Path(docx_path).name

    try:
        from docx import Document  # pylint: disable=import-outside-toplevel
        from docx.oxml.ns import qn  # pylint: disable=import-outside-toplevel
    except ImportError as e:
        raise ImportError(
            "python-docx is required for Word document processing. "
            "Install it with: uv pip install python-docx"
        ) from e

    def _corrupt_file_message(reason):
        # Shared by both "not a valid .docx" failure paths below so the
        # user-facing guidance cannot drift between them.
        return (
            f"Could not read Word file: {file_name}\n"
            f"Reason: {reason}\n"
            "The file appears to be corrupted or not a valid .docx file.\n"
            "Suggestions:\n"
            " 1. Re-download or re-export the document\n"
            " 2. Try opening the file in Word to confirm it is readable\n"
            " 3. Save as PDF and index the PDF instead"
        )

    # Guard against zip bombs: .docx is a ZIP container. Check the total
    # declared uncompressed size is sane before handing it to python-docx.
    max_uncompressed = 500 * 1024 * 1024  # 500 MB
    try:
        with zipfile.ZipFile(docx_path, "r") as zf:
            total_uncompressed = sum(info.file_size for info in zf.infolist())
    except zipfile.BadZipFile as e:
        self.log.error(f"Corrupted DOCX (bad zip): {docx_path}: {e}")
        raise ValueError(_corrupt_file_message(e)) from e
    except OSError as e:
        # Missing file, a directory, or an unreadable path — surface an
        # actionable error instead of a raw OSError traceback.
        msg = (
            f"Could not read Word file: {file_name}\n"
            f"Reason: {e}\n"
            "Check that the path exists and points to a readable .docx file."
        )
        self.log.error(f"Cannot open DOCX {docx_path}: {e}")
        raise ValueError(msg) from e

    if total_uncompressed > max_uncompressed:
        msg = (
            f"Word file too large after decompression: {file_name}\n"
            f"Uncompressed size: {total_uncompressed / (1024*1024):.0f} MB "
            f"(limit: {max_uncompressed / (1024*1024):.0f} MB)\n"
            "The file may be a zip bomb or contain very large embedded media.\n"
            "Suggestions:\n"
            " 1. Remove unnecessary images/media to reduce file size\n"
            " 2. Save as PDF and index the PDF instead"
        )
        self.log.error(
            f"DOCX zip bomb guard: {docx_path} "
            f"({total_uncompressed} bytes uncompressed)"
        )
        raise ValueError(msg)

    try:
        doc = Document(docx_path)
    except Exception as e:
        # A valid ZIP that is still not a valid Word package ends up here.
        self.log.error(f"Corrupted DOCX {docx_path}: {e}")
        raise ValueError(_corrupt_file_message(e)) from e

    try:
        w_p, w_tbl, w_sdt = qn("w:p"), qn("w:tbl"), qn("w:sdt")
        w_sdt_content = qn("w:sdtContent")
        w_tr, w_tc = qn("w:tr"), qn("w:tc")
        w_t, w_tab = qn("w:t"), qn("w:tab")
        w_br, w_cr = qn("w:br"), qn("w:cr")
        # ``mc`` (markup-compatibility) isn't in python-docx's prefix map,
        # so qn() can't resolve it — use the namespace URI directly.
        mc_fallback = (
            "{http://schemas.openxmlformats.org/markup-compatibility/2006}"
            "Fallback"
        )

        def _line_boundary(out):
            # Start a new line unless the collected text is empty or
            # already ends on one (avoids stacking blank lines).
            if out and not out[-1].endswith("\n"):
                out.append("\n")

        def _para_runs(elem, out):
            # Walk a paragraph's inline tree in document order, translating
            # leaf elements to text. Descends through runs, hyperlinks,
            # inline content controls (w:sdt), and textbox content so their
            # text is captured — but skips ``mc:Fallback`` (the VML twin of
            # a DrawingML textbox) so shape text is not emitted twice.
            for child in elem:
                tag = child.tag
                if tag == w_t:
                    if child.text:
                        out.append(child.text)
                elif tag == w_tab:
                    out.append("\t")
                elif tag in (w_br, w_cr):
                    out.append("\n")
                elif tag == mc_fallback:
                    continue
                elif tag == w_p:
                    # A paragraph nested inside a paragraph is textbox
                    # content. Fence it with line boundaries; otherwise
                    # the textbox's paragraphs (and the host paragraph's
                    # own text) would be concatenated with no whitespace.
                    _line_boundary(out)
                    _para_runs(child, out)
                    _line_boundary(out)
                else:
                    _para_runs(child, out)

        def _paragraph_text(p_elem):
            out = []
            _para_runs(p_elem, out)
            return "".join(out).strip()

        def _iter_children(elem, wanted):
            # Yield direct ``wanted`` children, transparently descending
            # through w:sdt wrappers (repeating-section / row / cell content
            # controls keep their rows and cells inside w:sdtContent).
            for child in elem:
                if child.tag == wanted:
                    yield child
                elif child.tag == w_sdt:
                    content = child.find(w_sdt_content)
                    if content is not None:
                        yield from _iter_children(content, wanted)

        def _emit(elem, parts):
            # Recursively walk a block element in document order, appending
            # text fragments. Unknown tags (section props, etc.) are skipped.
            if elem.tag == w_p:
                para_text = _paragraph_text(elem)
                if para_text:
                    parts.append(para_text)
            elif elem.tag == w_tbl:
                for row in _iter_children(elem, w_tr):
                    cells = []
                    for cell in _iter_children(row, w_tc):
                        cell_parts = []
                        for cell_child in cell:
                            _emit(cell_child, cell_parts)
                        cells.append(" ".join(cell_parts).strip())
                    # Drop rows whose cells are all empty.
                    if any(cells):
                        parts.append(" | ".join(cells))
            elif elem.tag == w_sdt:
                # Block-level content control — descend into its content.
                content = elem.find(w_sdt_content)
                if content is not None:
                    for sdt_child in content:
                        _emit(sdt_child, parts)

        parts = []
        for child in doc.element.body:
            _emit(child, parts)

        text = "\n".join(parts)

        if self.config.show_stats:
            print(f" ✅ Loaded Word file ({len(text):,} chars)")
        self.log.info(f"📄 Extracted {len(text):,} characters from Word file")
        return text
    except Exception as e:
        self.log.error(f"Error reading Word file {docx_path}: {e}")
        raise

def _extract_text_from_file(self, file_path: str) -> tuple:
"""
Extract text from file based on type.
Expand Down Expand Up @@ -1646,6 +1828,10 @@ def _extract_text_from_file(self, file_path: str) -> tuple:
elif file_type in [".xlsx", ".xls"]:
return self._extract_text_from_xlsx(file_path), metadata

# Word files
elif file_type == ".docx":
return self._extract_text_from_docx(file_path), metadata

# Code files (treat as text for Q&A purposes)
elif file_type in [
# Backend languages
Expand Down
Loading
Loading