amd · kovtcharov · Jun 4, 2026 · Jun 4, 2026 · Jun 10, 2026 · Jun 10, 2026
@@ -94,6 +94,20 @@ jobs:
     uses: ./.github/workflows/test_browser_agent.yml
     if: github.event_name != 'pull_request' || github.event.pull_request.draft == false || contains(github.event.pull_request.labels.*.name, 'ready_for_ci')
 
+  # Test DocQA Agent (standalone hub wheel, #1102)
+  test-docqa-agent:
+    name: DocQA Agent Tests
+    needs: lint
+    uses: ./.github/workflows/test_docqa_agent.yml
+    if: github.event_name != 'pull_request' || github.event.pull_request.draft == false || contains(github.event.pull_request.labels.*.name, 'ready_for_ci')
+
+  # Test Routing Agent (standalone hub wheel, #1102)
+  test-routing-agent:
+    name: Routing Agent Tests
+    needs: lint
+    uses: ./.github/workflows/test_routing_agent.yml
+    if: github.event_name != 'pull_request' || github.event.pull_request.draft == false || contains(github.event.pull_request.labels.*.name, 'ready_for_ci')
+
   # Test Email Agent (standalone hub wheel, #1102)
   test-email-agent:
     name: Email Agent Tests
@@ -126,7 +140,7 @@ jobs:
   test-summary:
     name: Test Summary
     runs-on: ubuntu-latest
-    needs: [lint, unit-tests, test-windows, test-linux, test-mcp, test-code-agent, test-chat-agent, test-connectors-demo, test-analyst-agent, test-browser-agent, test-email-agent, test-docqa-agent, test-routing-agent, test-security]
+    needs: [lint, unit-tests, test-windows, test-linux, test-mcp, test-code-agent, test-chat-agent, test-connectors-demo, test-analyst-agent, test-browser-agent, test-docqa-agent, test-routing-agent, test-email-agent, test-security]
     # Run always except when workflow or any dependency is cancelled (e.g., by cancel-in-progress)
     if: >-
       ${{ always() && !cancelled() &&

@@ -102,24 +102,26 @@ gaia chat --query "Hello" --show-stats
 ## Document Q&A (RAG)
 
 <Note>
-RAG (Retrieval-Augmented Generation) enables chatting with PDF and PowerPoint (.pptx) documents using semantic search and context retrieval.
+RAG (Retrieval-Augmented Generation) enables chatting with documents — including PDF, Word (.docx), PowerPoint (.pptx), and Excel (.xlsx) — using semantic search and context retrieval.
 </Note>
 
 ### CLI with RAG
 
 <Tabs>
   <Tab title="Single Document">
     ```bash
-    # Chat with a PDF or PowerPoint document
+    # Chat with a PDF, Word, PowerPoint, or Excel document
     gaia chat --index manual.pdf
+    gaia chat --index handbook.docx
     gaia chat --index slides.pptx
+    gaia chat --index budget.xlsx
     ```
   </Tab>
 
   <Tab title="Multiple Documents">
     ```bash
-    # Chat with multiple documents (PDF and PPTX supported)
-    gaia chat --index doc1.pdf doc2.pdf slides.pptx
+    # Chat with multiple documents (PDF, DOCX, PPTX, XLSX supported)
+    gaia chat --index doc1.pdf report.docx slides.pptx
     ```
   </Tab>
 
@@ -132,7 +134,7 @@ RAG (Retrieval-Augmented Generation) enables chatting with PDF and PowerPoint (.
 
   <Tab title="Watch Folder">
     ```bash
-    # Auto-index every PDF/PPTX in a folder, and any new ones dropped in later
+    # Auto-index every supported document in a folder, and any new ones dropped in later
     gaia chat --watch ./docs
     ```
   </Tab>

@@ -49,6 +49,13 @@ RAG searches your documents and answers using real information.
 - **Augmented** → Add that information to the LLM's context
 - **Generation** → LLM generates an answer using your documents
 
+<Note>
+**Supported document formats:** PDF, Word (`.docx`), PowerPoint (`.pptx`),
+Excel (`.xlsx`), plus plain text, Markdown, CSV, JSON, HTML, and common source
+code files. Legacy binary Office formats (`.doc`, `.ppt`, `.xls`) are not
+supported — re-save them as the modern `.docx` / `.pptx` / `.xlsx` formats.
+</Note>
+
 ---
 
 ## How RAG Works: A Mental Model
@@ -62,7 +69,7 @@ This happens once when you index documents:
 ```mermaid
 %%{init: {'theme':'base', 'themeVariables': { 'primaryColor':'#E2A33E', 'primaryTextColor':'#1a1a1a', 'primaryBorderColor':'#A87B2D', 'lineColor':'#EFC480', 'secondaryColor':'#2d2d2d', 'tertiaryColor':'#f5f5f5', 'fontFamily': 'system-ui, -apple-system, sans-serif'}}}%%
 flowchart TD
-    A(["PDF / PPTX Document"]) --> B(["EXTRACT TEXT"])
+    A(["PDF / DOCX / PPTX / XLSX Document"]) --> B(["EXTRACT TEXT"])
     B --> C(["SPLIT INTO CHUNKS"])
     C --> D(["GENERATE EMBEDDINGS"])
     D --> E[("STORE IN FAISS")]

@@ -146,6 +146,7 @@
             "pymupdf>=1.24.0",
             "pypdf",
             "python-pptx>=0.6.21",
+            "python-docx>=1.1.0",
             "sentence-transformers",
             "safetensors",
             # torch is pinned lower-bound only. The "audio" extra caps
@@ -232,6 +233,7 @@
             "pymupdf>=1.24.0",
             "pypdf",
             "python-pptx>=0.6.21",
+            "python-docx>=1.1.0",
             "sentence-transformers",
         ],
         "lint": [
@@ -266,9 +268,9 @@
         "agent-connectors-demo": ["gaia-agent-connectors-demo"],
         "agent-analyst": ["gaia-agent-analyst"],
         "agent-browser": ["gaia-agent-browser"],
-        "agent-email": ["gaia-agent-email"],
         "agent-docqa": ["gaia-agent-docqa"],
         "agent-routing": ["gaia-agent-routing"],
+        "agent-email": ["gaia-agent-email"],
         "agent-chat": ["gaia-agent-chat"],
         "agents": [
             "gaia-agent-summarize",
@@ -282,9 +284,9 @@
             "gaia-agent-connectors-demo",
             "gaia-agent-analyst",
             "gaia-agent-browser",
-            "gaia-agent-email",
             "gaia-agent-docqa",
             "gaia-agent-routing",
+            "gaia-agent-email",
             "gaia-agent-chat",
         ],
     },

@@ -84,17 +84,18 @@ const UNSUPPORTED_FILE_CATEGORIES: FileTypeCategory[] = [
     },
     {
         label: 'Microsoft Office',
-        extensions: new Set(['.doc', '.docx', '.ppt', '.xls']),
+        extensions: new Set(['.doc', '.ppt', '.xls']),
         message:
-            'Word, legacy PowerPoint (.ppt), and legacy Excel (.xls) files are ' +
-            'not yet supported — GAIA does not currently ship parsers for these ' +
-            'formats. Modern PowerPoint (.pptx) is supported.',
+            'Legacy Office formats (.doc, legacy PowerPoint .ppt, legacy Excel ' +
+            '.xls) are not supported — GAIA reads only the modern XML-based ' +
+            'formats. Modern Word (.docx), PowerPoint (.pptx), and Excel (.xlsx) ' +
+            'are supported.',
         alternatives: [
+            'Re-save as .docx — GAIA indexes modern Word documents directly',
             'Save modern PowerPoint as .pptx — GAIA indexes .pptx directly',
-            'Save Word as PDF, then index the PDF',
             'Re-save legacy .xls workbooks as .xlsx — GAIA supports modern Excel files',
         ],
-        featureTitle: 'Support Microsoft Office (doc, docx, ppt, xls) indexing',
+        featureTitle: 'Support legacy Microsoft Office (doc, ppt, xls) indexing',
     },
 ];
 
@@ -117,11 +118,11 @@ export function getUnsupportedCategory(extension: string): FileTypeCategory | nu
  * ``src/gaia/ui/utils.py``. Only list extensions that have a real extractor
  * in ``src/gaia/rag/sdk.py::_extract_text_from_file`` — listing one without
  * a backend handler causes the RAG pipeline to index binary garbage.
- * .pptx IS supported (python-pptx ships with GAIA); .doc/.docx/.ppt and
- * legacy .xls are intentionally excluded.
+ * .pptx and .docx ARE supported (python-pptx / python-docx ship with GAIA);
+ * legacy .doc/.ppt/.xls are intentionally excluded.
  */
 export const SUPPORTED_EXTENSIONS = new Set([
-    '.pdf', '.pptx', '.txt', '.md', '.csv', '.json', '.xlsx',
+    '.pdf', '.pptx', '.docx', '.txt', '.md', '.csv', '.json', '.xlsx',
     '.html', '.htm', '.xml', '.svg',
     '.yaml', '.yml', '.py', '.js', '.ts', '.java', '.c', '.cpp',
     '.h', '.rs', '.go', '.rb', '.sh', '.bat', '.ps1', '.log',

@@ -32,8 +32,8 @@ describe('getUnsupportedCategory', () => {
         expect(getUnsupportedCategory('.pdf')).toBeNull();
     });
 
-    it('returns "Microsoft Office" for .docx', () => {
-        expect(getUnsupportedCategory('.docx')?.label).toBe('Microsoft Office');
+    it('returns null for .docx — Word indexing is supported', () => {
+        expect(getUnsupportedCategory('.docx')).toBeNull();
     });
 
     it('returns null for .pptx — PowerPoint indexing is supported', () => {
@@ -60,6 +60,10 @@ describe('isExtensionSupported', () => {
         expect(isExtensionSupported('.pptx')).toBe(true);
     });
 
+    it('returns true for .docx — modern Word is indexable', () => {
+        expect(isExtensionSupported('.docx')).toBe(true);
+    });
+
     it('returns false for .exe', () => {
         expect(isExtensionSupported('.exe')).toBe(false);
     });

@@ -1601,6 +1601,188 @@ def _extract_text_from_xlsx(self, xlsx_path: str) -> str:
             self.log.error(f"Error reading Excel file {xlsx_path}: {e}")
             raise
 
+    def _extract_text_from_docx(self, docx_path: str) -> str:
+        """Extract text from a Word (.docx) document using python-docx.
+
+        Walks the document body in order so paragraphs and tables stay
+        interleaved. Paragraph text is collected by walking the inline tree,
+        so runs nested in hyperlinks, **content controls** (``w:sdt`` — the
+        fields used by form/template documents), and textboxes are captured —
+        not just the direct runs that ``Paragraph.text`` exposes. Tabs and
+        line/page breaks become whitespace so adjacent words don't glue
+        together, and the ``mc:Fallback`` (VML) twin of a DrawingML textbox is
+        skipped so shape text isn't emitted twice. Table cells (including
+        tables nested in a cell, and rows/cells wrapped in repeating-section
+        content controls) and block-level content controls are recursed into.
+
+        Corrupt / non-.docx files and a missing ``python-docx`` install raise
+        actionable errors, mirroring :meth:`_extract_text_from_pptx` (.docx is
+        a ZIP container like .pptx).
+
+        Known omissions: header/footer text (separate XML parts, usually
+        repeated boilerplate) and embedded images (TODO #1072: VLM extraction
+        for images embedded in .docx files).
+
+        Returns:
+            The extracted text as a single string.
+        """
+        file_name = Path(docx_path).name
+
+        try:
+            from docx import Document  # pylint: disable=import-outside-toplevel
+            from docx.oxml.ns import qn  # pylint: disable=import-outside-toplevel
+        except ImportError as e:
+            raise ImportError(
+                "python-docx is required for Word document processing. "
+                "Install it with: uv pip install python-docx"
+            ) from e
+
+        # Guard against zip bombs: .docx is a ZIP container. Check the total
+        # uncompressed size is sane before handing it to python-docx.
+        try:
+            with zipfile.ZipFile(docx_path, "r") as zf:
+                total_uncompressed = sum(info.file_size for info in zf.infolist())
+                max_uncompressed = 500 * 1024 * 1024  # 500 MB
+                if total_uncompressed > max_uncompressed:
+                    msg = (
+                        f"Word file too large after decompression: {file_name}\n"
+                        f"Uncompressed size: {total_uncompressed / (1024*1024):.0f} MB "
+                        f"(limit: {max_uncompressed / (1024*1024):.0f} MB)\n"
+                        "The file may be a zip bomb or contain very large embedded media.\n"
+                        "Suggestions:\n"
+                        "  1. Remove unnecessary images/media to reduce file size\n"
+                        "  2. Save as PDF and index the PDF instead"
+                    )
+                    self.log.error(
+                        f"DOCX zip bomb guard: {docx_path} "
+                        f"({total_uncompressed} bytes uncompressed)"
+                    )
+                    raise ValueError(msg)
+        except zipfile.BadZipFile as e:
+            msg = (
+                f"Could not read Word file: {file_name}\n"
+                f"Reason: {e}\n"
+                "The file appears to be corrupted or not a valid .docx file.\n"
+                "Suggestions:\n"
+                "  1. Re-download or re-export the document\n"
+                "  2. Try opening the file in Word to confirm it is readable\n"
+                "  3. Save as PDF and index the PDF instead"
+            )
+            self.log.error(f"Corrupted DOCX (bad zip): {docx_path}: {e}")
+            raise ValueError(msg) from e
+        except OSError as e:
+            # Missing file, a directory, or an unreadable path — surface an
+            # actionable error instead of a raw OSError traceback.
+            msg = (
+                f"Could not read Word file: {file_name}\n"
+                f"Reason: {e}\n"
+                "Check that the path exists and points to a readable .docx file."
+            )
+            self.log.error(f"Cannot open DOCX {docx_path}: {e}")
+            raise ValueError(msg) from e
+
+        try:
+            doc = Document(docx_path)
+        except Exception as e:
+            msg = (
+                f"Could not read Word file: {file_name}\n"
+                f"Reason: {e}\n"
+                "The file appears to be corrupted or not a valid .docx file.\n"
+                "Suggestions:\n"
+                "  1. Re-download or re-export the document\n"
+                "  2. Try opening the file in Word to confirm it is readable\n"
+                "  3. Save as PDF and index the PDF instead"
+            )
+            self.log.error(f"Corrupted DOCX {docx_path}: {e}")
+            raise ValueError(msg) from e
+
+        try:
+            w_p, w_tbl, w_sdt = qn("w:p"), qn("w:tbl"), qn("w:sdt")
+            w_sdt_content = qn("w:sdtContent")
+            w_tr, w_tc = qn("w:tr"), qn("w:tc")
+            w_t, w_tab = qn("w:t"), qn("w:tab")
+            w_br, w_cr = qn("w:br"), qn("w:cr")
+            # ``mc`` (markup-compatibility) isn't in python-docx's prefix map,
+            # so qn() can't resolve it — use the namespace URI directly.
+            mc_fallback = (
+                "{http://schemas.openxmlformats.org/markup-compatibility/2006}"
+                "Fallback"
+            )
+
+            def _para_runs(elem, out):
+                # Walk a paragraph's inline tree in document order, translating
+                # leaf elements to text. Descends through runs, hyperlinks,
+                # inline content controls (w:sdt), and textbox content so their
+                # text is captured — but skips ``mc:Fallback`` (the VML twin of
+                # a DrawingML textbox) so shape text is not emitted twice.
+                for child in elem:
+                    tag = child.tag
+                    if tag == w_t:
+                        out.append(child.text or "")
+                    elif tag == w_tab:
+                        out.append("\t")
+                    elif tag in (w_br, w_cr):
+                        out.append("\n")
+                    elif tag == mc_fallback:
+                        continue
+                    else:
+                        _para_runs(child, out)
+
+            def _paragraph_text(p_elem):
+                out = []
+                _para_runs(p_elem, out)
+                return "".join(out).strip()
+
+            def _iter_children(elem, wanted):
+                # Yield direct ``wanted`` children, transparently descending
+                # through w:sdt wrappers (repeating-section / row / cell content
+                # controls keep their rows and cells inside w:sdtContent).
+                for child in elem:
+                    if child.tag == wanted:
+                        yield child
+                    elif child.tag == w_sdt:
+                        content = child.find(w_sdt_content)
+                        if content is not None:
+                            yield from _iter_children(content, wanted)
+
+            def _emit(elem, parts):
+                # Recursively walk a block element in document order, appending
+                # text fragments. Unknown tags (section props, etc.) are skipped.
+                if elem.tag == w_p:
+                    para_text = _paragraph_text(elem)
+                    if para_text:
+                        parts.append(para_text)
+                elif elem.tag == w_tbl:
+                    for row in _iter_children(elem, w_tr):
+                        cells = []
+                        for cell in _iter_children(row, w_tc):
+                            cell_parts = []
+                            for cell_child in cell:
+                                _emit(cell_child, cell_parts)
+                            cells.append(" ".join(cell_parts).strip())
+                        if any(cells):
+                            parts.append(" | ".join(cells))
+                elif elem.tag == w_sdt:
+                    # Block-level content control — descend into its content.
+                    content = elem.find(w_sdt_content)
+                    if content is not None:
+                        for sdt_child in content:
+                            _emit(sdt_child, parts)
+
+            parts = []
+            for child in doc.element.body:
+                _emit(child, parts)
+
+            text = "\n".join(parts)
+
+            if self.config.show_stats:
+                print(f"  ✅ Loaded Word file ({len(text):,} chars)")
+            self.log.info(f"📄 Extracted {len(text):,} characters from Word file")
+            return text
+        except Exception as e:
+            self.log.error(f"Error reading Word file {docx_path}: {e}")
+            raise
+
     def _extract_text_from_file(self, file_path: str) -> tuple:
         """
         Extract text from file based on type.
@@ -1646,6 +1828,10 @@ def _extract_text_from_file(self, file_path: str) -> tuple:
         elif file_type in [".xlsx", ".xls"]:
             return self._extract_text_from_xlsx(file_path), metadata
 
+        # Word files
+        elif file_type == ".docx":
+            return self._extract_text_from_docx(file_path), metadata
+
         # Code files (treat as text for Q&A purposes)
         elif file_type in [
             # Backend languages