From 52c5651232534cb938b09b8f1311a4ccb223ae4f Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Sat, 6 Jun 2026 08:53:27 -0700 Subject: [PATCH 1/4] fix(scraper): bound playwright cleanup Prevent Playwright render failures from leaving page or browser cleanup promises unbounded. Reset the browser after render failures and force-kill Chromium if browser close itself hangs.\n\nCo-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../HtmlPlaywrightMiddleware.test.ts | 73 +++++++++ .../middleware/HtmlPlaywrightMiddleware.ts | 151 ++++++++++++++++-- 2 files changed, 210 insertions(+), 14 deletions(-) diff --git a/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts b/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts index 3df8de02..ecff1f64 100644 --- a/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts +++ b/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts @@ -251,6 +251,79 @@ describe("HtmlPlaywrightMiddleware", () => { launchSpy.mockRestore(); }); + it("should reset the browser after a Playwright render timeout", async () => { + const initialHtml = "

Test

"; + const context = createPipelineTestContext(initialHtml, "https://example.com/test"); + const next = vi.fn(); + + const pageSpy = createMockPlaywrightPage(initialHtml); + pageSpy.waitForSelector = vi + .fn() + .mockRejectedValue(new Error("Timeout 5000ms exceeded")); + const contextSpy = { + newPage: vi.fn().mockResolvedValue(pageSpy), + close: vi.fn().mockResolvedValue(undefined), + }; + const browserSpy = { + newContext: vi.fn().mockResolvedValue(contextSpy), + isConnected: vi.fn().mockReturnValue(true), + on: vi.fn(), + close: vi.fn().mockResolvedValue(undefined), + } as unknown as MockedObject; + const launchSpy = vi.spyOn(chromium, "launch").mockResolvedValue(browserSpy); + + await playwrightMiddleware.process(context, next); + + expect(context.errors).toHaveLength(1); + expect(context.errors[0].message).toContain("Timeout 5000ms exceeded"); + expect(pageSpy.close).toHaveBeenCalled(); + expect(contextSpy.close).toHaveBeenCalled(); + expect(browserSpy.close).toHaveBeenCalled(); + expect(next).toHaveBeenCalled(); + + launchSpy.mockRestore(); + }); + + it("should bound hung page cleanup and reset the browser", async () => { + vi.useFakeTimers(); + try { + const initialHtml = "

Test

"; + const context = createPipelineTestContext( + initialHtml, + "https://example.com/test", + ); + const next = vi.fn(); + + const pageSpy = createMockPlaywrightPage(initialHtml); + pageSpy.close = vi.fn().mockImplementation(() => new Promise(() => {})); + const contextSpy = { + newPage: vi.fn().mockResolvedValue(pageSpy), + close: vi.fn().mockResolvedValue(undefined), + }; + const browserSpy = { + newContext: vi.fn().mockResolvedValue(contextSpy), + isConnected: vi.fn().mockReturnValue(true), + on: vi.fn(), + close: vi.fn().mockResolvedValue(undefined), + } as unknown as MockedObject; + const launchSpy = vi.spyOn(chromium, "launch").mockResolvedValue(browserSpy); + + const processPromise = playwrightMiddleware.process(context, next); + await vi.advanceTimersByTimeAsync(5000); + await expect(processPromise).resolves.toBeUndefined(); + + expect(context.errors).toHaveLength(0); + expect(pageSpy.close).toHaveBeenCalled(); + expect(contextSpy.close).toHaveBeenCalled(); + expect(browserSpy.close).toHaveBeenCalled(); + expect(next).toHaveBeenCalled(); + + launchSpy.mockRestore(); + } finally { + vi.useRealTimers(); + } + }); + it("should skip processing when scrapeMode is not playwright/auto", async () => { const initialHtml = "

Test

"; const context = createPipelineTestContext(initialHtml, "https://example.com/test", { diff --git a/src/scraper/middleware/HtmlPlaywrightMiddleware.ts b/src/scraper/middleware/HtmlPlaywrightMiddleware.ts index 4b35fed0..df2dd96e 100644 --- a/src/scraper/middleware/HtmlPlaywrightMiddleware.ts +++ b/src/scraper/middleware/HtmlPlaywrightMiddleware.ts @@ -40,6 +40,16 @@ interface CachedResource { contentType: string; } +interface BrowserProcessHandle { + killed: boolean; + pid?: number; + kill(signal?: NodeJS.Signals | number): boolean; +} + +interface BrowserWithProcessHandle extends Browser { + process(): BrowserProcessHandle | null; +} + /** * Middleware to process HTML content using Playwright for rendering dynamic content, * *if* the scrapeMode option requires it ('playwright' or 'auto'). @@ -116,20 +126,74 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { return this.browser; } + private getPageTimeoutMs(): number { + return this.config.pageTimeoutMs ?? defaults.scraper.pageTimeoutMs; + } + + private getBrowserTimeoutMs(): number { + return this.config.browserTimeoutMs ?? defaults.scraper.browserTimeoutMs; + } + + private async withTimeout( + operation: Promise, + timeoutMs: number, + description: string, + ): Promise { + let timeout: NodeJS.Timeout | undefined; + try { + return await Promise.race([ + operation, + new Promise((_resolve, reject) => { + timeout = setTimeout(() => { + reject(new Error(`${description} timed out after ${timeoutMs}ms`)); + }, timeoutMs); + }), + ]); + } finally { + if (timeout) { + clearTimeout(timeout); + } + } + } + + private getBrowserProcess(browser: Browser): BrowserProcessHandle | null { + if (!("process" in browser) || typeof browser.process !== "function") { + return null; + } + + return (browser as BrowserWithProcessHandle).process(); + } + + private killBrowserProcess(browser: Browser, reason: string): void { + const browserProcess = this.getBrowserProcess(browser); + if (!browserProcess || browserProcess.killed) { + return; + } + + const pid = browserProcess.pid ? ` pid=${browserProcess.pid}` : ""; + logger.warn(`⚠️ Force killing Playwright browser process${pid}: ${reason}`); + browserProcess.kill("SIGKILL"); + } + /** * Closes the Playwright browser instance if it exists. * Should be called during application shutdown. * Attempts to close even if the browser is disconnected to ensure proper cleanup of zombie processes. */ - async closeBrowser(): Promise { + async closeBrowser(reason = "shutdown"): Promise { if (this.browser) { + const browser = this.browser; try { - logger.debug("Closing Playwright browser instance..."); + logger.debug(`Closing Playwright browser instance (${reason})...`); // Always attempt to close, even if disconnected, to reap zombie processes - await this.browser.close(); + await this.withTimeout( + browser.close(), + this.getPageTimeoutMs(), + "Playwright browser close", + ); } catch (error) { - // Log error but don't throw - cleanup should be non-fatal - logger.warn(`⚠️ Error closing Playwright browser: ${error}`); + logger.warn(`⚠️ Error closing Playwright browser (${reason}): ${error}`); + this.killBrowserProcess(browser, reason); } finally { // Always set to null to allow fresh browser on next request this.browser = null; @@ -137,6 +201,54 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { } } + private async cleanupPageAndContext( + page: Page | null, + browserContext: BrowserContext | null, + source: string, + ): Promise { + let cleanupSucceeded = true; + const timeoutMs = this.getPageTimeoutMs(); + + if (page) { + try { + await this.withTimeout( + page.unroute("**/*"), + timeoutMs, + `Playwright route cleanup for ${source}`, + ); + } catch (error) { + cleanupSucceeded = false; + logger.warn(`⚠️ Playwright route cleanup failed for ${source}: ${error}`); + } + + try { + await this.withTimeout( + page.close(), + timeoutMs, + `Playwright page close for ${source}`, + ); + } catch (error) { + cleanupSucceeded = false; + logger.warn(`⚠️ Playwright page cleanup failed for ${source}: ${error}`); + } + } + + if (browserContext) { + try { + await this.withTimeout( + browserContext.close(), + timeoutMs, + `Playwright browser context close for ${source}`, + ); + } catch (error) { + cleanupSucceeded = false; + logger.warn(`⚠️ Playwright context cleanup failed for ${source}: ${error}`); + } + } + + return cleanupSucceeded; + } + /** * Injects the shadow DOM extractor script into the page. * This script performs non-invasive extraction that preserves document structure. @@ -979,6 +1091,7 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { let page: Page | null = null; let browserContext: BrowserContext | null = null; let renderedHtml: string | null = null; + let shouldResetBrowser = false; // Extract credentials and origin using helper const { credentials, origin } = extractCredentialsAndOrigin(context.source); @@ -1052,11 +1165,15 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { ); }); + // Wait for either body (normal HTML) or frameset (frameset documents) to appear + const pageTimeoutMs = this.getPageTimeoutMs(); + // Load initial HTML content - await page.goto(context.source, { waitUntil: "load" }); + await page.goto(context.source, { + waitUntil: "load", + timeout: this.getBrowserTimeoutMs(), + }); - // Wait for either body (normal HTML) or frameset (frameset documents) to appear - const pageTimeoutMs = this.config.pageTimeoutMs ?? defaults.scraper.pageTimeoutMs; await page.waitForSelector("body, frameset", { timeout: pageTimeoutMs, }); @@ -1081,6 +1198,7 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { `Playwright: Successfully rendered content for ${context.source} using ${method}`, ); } catch (error) { + shouldResetBrowser = true; logger.error(`❌ Playwright failed to render ${context.source}: ${error}`); context.errors.push( error instanceof Error @@ -1088,13 +1206,18 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { : new Error(`Playwright rendering failed: ${String(error)}`), ); } finally { - // Ensure page/context are closed even if subsequent steps fail - if (page) { - await page.unroute("**/*"); - await page.close(); + const cleanupSucceeded = await this.cleanupPageAndContext( + page, + browserContext, + context.source, + ); + if (!cleanupSucceeded) { + shouldResetBrowser = true; } - if (browserContext) { - await browserContext.close(); + if (shouldResetBrowser) { + await this.closeBrowser( + `reset after Playwright render failure for ${context.source}`, + ); } } From dbafb87ea60cb9d1b8c405b636cbc84d9a0cdf16 Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Sat, 6 Jun 2026 10:30:13 -0700 Subject: [PATCH 2/4] fix(scraper): split oversized markdown tables Avoid the Joplin GFM table conversion path hanging on very large tables by splitting oversized tables into smaller row chunks before Turndown runs. This preserves full table content while bounding per-table conversion work. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../HtmlToMarkdownMiddleware.test.ts | 26 ++++++ .../middleware/HtmlToMarkdownMiddleware.ts | 93 +++++++++++++++++++ 2 files changed, 119 insertions(+) diff --git a/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts b/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts index 176e4948..ffa6ad91 100644 --- a/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts +++ b/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts @@ -127,6 +127,32 @@ describe("HtmlToMarkdownMiddleware", () => { // No close needed }); + it("should split oversized tables before GFM conversion while retaining content", async () => { + const middleware = new HtmlToMarkdownMiddleware(); + const rows = Array.from( + { length: 505 }, + (_, index) => + `Row ${index + 1}Value ${index + 1}`, + ).join(""); + const html = ` + + + + ${rows} +
NameValue
+ `; + const context = createMockContext(html); + const next = vi.fn().mockResolvedValue(undefined); + + await middleware.process(context, next); + + expect(next).toHaveBeenCalledOnce(); + expect(context.errors).toHaveLength(0); + expect(context.content).toContain("| Row 1 | **Value 1** |"); + expect(context.content).toContain("| Row 505 | **Value 505** |"); + expect(context.content.match(/\| Name \| Value \|/g)).toHaveLength(6); + }); + it("should return empty string and markdown type if conversion results in empty markdown", async () => { const middleware = new HtmlToMarkdownMiddleware(); // HTML that results in empty markdown (only comments) diff --git a/src/scraper/middleware/HtmlToMarkdownMiddleware.ts b/src/scraper/middleware/HtmlToMarkdownMiddleware.ts index cbcf76d3..ec16650b 100644 --- a/src/scraper/middleware/HtmlToMarkdownMiddleware.ts +++ b/src/scraper/middleware/HtmlToMarkdownMiddleware.ts @@ -1,10 +1,17 @@ // @ts-expect-error import { gfm } from "@joplin/turndown-plugin-gfm"; +import type * as cheerio from "cheerio"; +import type { Element } from "domhandler"; import TurndownService from "turndown"; import { logger } from "../../utils/logger"; import { fullTrim } from "../../utils/string"; import type { ContentProcessorMiddleware, MiddlewareContext } from "./types"; +const maxGfmTableRows = 500; +const maxGfmTableCells = 1_000; +const maxGfmTableHtmlLength = 100_000; +const gfmTableChunkRows = 100; + /** * Middleware to convert the final processed HTML content (from Cheerio object in context.dom) * into Markdown using Turndown, applying custom rules. @@ -100,6 +107,91 @@ export class HtmlToMarkdownMiddleware implements ContentProcessorMiddleware { return fullTrim(content).replace(/[ \t]*[\r\n]+[ \t]*/g, " "); } + private splitOversizedTables($: cheerio.CheerioAPI, source: string): void { + $("table").each((_index, element) => { + const $table = $(element); + const rowCount = $table.find("tr").length; + const cellCount = $table.find("td, th").length; + const htmlLength = $.html(element).length; + + if ( + rowCount <= maxGfmTableRows && + cellCount <= maxGfmTableCells && + htmlLength <= maxGfmTableHtmlLength + ) { + return; + } + + const replacement = this.buildSplitTableHtml($, $table); + if (!replacement) { + return; + } + + logger.warn( + `⚠️ Splitting oversized HTML table for ${source} before GFM conversion: ${rowCount} rows, ${cellCount} cells, ${htmlLength} chars`, + ); + $table.replaceWith(replacement); + }); + } + + private buildSplitTableHtml( + $: cheerio.CheerioAPI, + $table: cheerio.Cheerio, + ): string | null { + const headerRows = this.getHeaderRows($table); + const headerRowSet = new Set(headerRows); + const dataRows = $table + .find("tr") + .toArray() + .filter((row): row is Element => row.type === "tag" && !headerRowSet.has(row)); + + if (dataRows.length <= gfmTableChunkRows) { + return null; + } + + const chunks: string[] = []; + const headerHtml = headerRows.map((row) => $.html(row)).join(""); + + for (let index = 0; index < dataRows.length; index += gfmTableChunkRows) { + const rowsHtml = dataRows + .slice(index, index + gfmTableChunkRows) + .map((row) => $.html(row)) + .join(""); + const tableParts = [""]; + if (headerHtml) { + tableParts.push("", headerHtml, ""); + } + tableParts.push("", rowsHtml, "
"); + chunks.push(tableParts.join("")); + } + + return chunks.join("\n"); + } + + private getHeaderRows($table: cheerio.Cheerio): Element[] { + const theadRows = $table + .children("thead") + .find("tr") + .toArray() + .filter((row): row is Element => row.type === "tag"); + + if (theadRows.length > 0) { + return theadRows; + } + + const firstRow = $table.find("tr").first(); + if (firstRow.length === 0) { + return []; + } + + const cells = firstRow.children("th, td").toArray(); + const isHeaderRow = + cells.length > 0 && + cells.every((cell) => cell.type === "tag" && cell.tagName === "th"); + + return isHeaderRow ? (firstRow.toArray() as Element[]) : []; + } + /** * Processes the context to convert the sanitized HTML body node to Markdown. * @param context The current processing context. @@ -119,6 +211,7 @@ export class HtmlToMarkdownMiddleware implements ContentProcessorMiddleware { // Only process if we have a Cheerio object (implicitly means it's HTML) try { logger.debug(`Converting HTML content to Markdown for ${context.source}`); + this.splitOversizedTables($, context.source); // Provide Turndown with the HTML string content from the Cheerio object's body, // or the whole document if body is empty/unavailable. const htmlToConvert = $("body").html() || $.html(); From 68ec6465e4ee18015a2568d73939a12491cd62a1 Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Sat, 6 Jun 2026 10:33:42 -0700 Subject: [PATCH 3/4] fix(scraper): guard browser kill failures Keep forced Playwright process cleanup from throwing out of failure-handling paths when SIGKILL delivery itself fails or reports no target. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/scraper/middleware/HtmlPlaywrightMiddleware.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/scraper/middleware/HtmlPlaywrightMiddleware.ts b/src/scraper/middleware/HtmlPlaywrightMiddleware.ts index df2dd96e..4d36c78d 100644 --- a/src/scraper/middleware/HtmlPlaywrightMiddleware.ts +++ b/src/scraper/middleware/HtmlPlaywrightMiddleware.ts @@ -172,7 +172,13 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { const pid = browserProcess.pid ? ` pid=${browserProcess.pid}` : ""; logger.warn(`⚠️ Force killing Playwright browser process${pid}: ${reason}`); - browserProcess.kill("SIGKILL"); + try { + if (!browserProcess.kill("SIGKILL")) { + logger.warn(`⚠️ Failed to force kill Playwright browser process${pid}`); + } + } catch (error) { + logger.warn(`⚠️ Failed to force kill Playwright browser process${pid}: ${error}`); + } } /** From dd0f971895404fdd5f41b291f1700735735db5d2 Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Sat, 6 Jun 2026 10:56:53 -0700 Subject: [PATCH 4/4] fix(scraper): refine table splitting fallback Preserve caption and colgroup metadata when chunking large tables, keep unsplittable oversized tables as HTML via placeholders, and make Playwright reset reasons distinguish render and cleanup failures. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../HtmlPlaywrightMiddleware.test.ts | 10 ++ .../middleware/HtmlPlaywrightMiddleware.ts | 14 ++- .../HtmlToMarkdownMiddleware.test.ts | 96 +++++++++++++++++++ .../middleware/HtmlToMarkdownMiddleware.ts | 44 ++++++++- 4 files changed, 159 insertions(+), 5 deletions(-) diff --git a/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts b/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts index ecff1f64..f4609619 100644 --- a/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts +++ b/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts @@ -271,6 +271,7 @@ describe("HtmlPlaywrightMiddleware", () => { close: vi.fn().mockResolvedValue(undefined), } as unknown as MockedObject; const launchSpy = vi.spyOn(chromium, "launch").mockResolvedValue(browserSpy); + const closeBrowserSpy = vi.spyOn(playwrightMiddleware, "closeBrowser"); await playwrightMiddleware.process(context, next); @@ -279,8 +280,12 @@ describe("HtmlPlaywrightMiddleware", () => { expect(pageSpy.close).toHaveBeenCalled(); expect(contextSpy.close).toHaveBeenCalled(); expect(browserSpy.close).toHaveBeenCalled(); + expect(closeBrowserSpy).toHaveBeenCalledWith( + "reset after Playwright render failure for https://example.com/test", + ); expect(next).toHaveBeenCalled(); + closeBrowserSpy.mockRestore(); launchSpy.mockRestore(); }); @@ -307,6 +312,7 @@ describe("HtmlPlaywrightMiddleware", () => { close: vi.fn().mockResolvedValue(undefined), } as unknown as MockedObject; const launchSpy = vi.spyOn(chromium, "launch").mockResolvedValue(browserSpy); + const closeBrowserSpy = vi.spyOn(playwrightMiddleware, "closeBrowser"); const processPromise = playwrightMiddleware.process(context, next); await vi.advanceTimersByTimeAsync(5000); @@ -316,8 +322,12 @@ describe("HtmlPlaywrightMiddleware", () => { expect(pageSpy.close).toHaveBeenCalled(); expect(contextSpy.close).toHaveBeenCalled(); expect(browserSpy.close).toHaveBeenCalled(); + expect(closeBrowserSpy).toHaveBeenCalledWith( + "reset after Playwright cleanup failure for https://example.com/test", + ); expect(next).toHaveBeenCalled(); + closeBrowserSpy.mockRestore(); launchSpy.mockRestore(); } finally { vi.useRealTimers(); diff --git a/src/scraper/middleware/HtmlPlaywrightMiddleware.ts b/src/scraper/middleware/HtmlPlaywrightMiddleware.ts index 4d36c78d..a241e7bc 100644 --- a/src/scraper/middleware/HtmlPlaywrightMiddleware.ts +++ b/src/scraper/middleware/HtmlPlaywrightMiddleware.ts @@ -1098,6 +1098,7 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { let browserContext: BrowserContext | null = null; let renderedHtml: string | null = null; let shouldResetBrowser = false; + let renderFailed = false; // Extract credentials and origin using helper const { credentials, origin } = extractCredentialsAndOrigin(context.source); @@ -1204,6 +1205,7 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { `Playwright: Successfully rendered content for ${context.source} using ${method}`, ); } catch (error) { + renderFailed = true; shouldResetBrowser = true; logger.error(`❌ Playwright failed to render ${context.source}: ${error}`); context.errors.push( @@ -1217,13 +1219,17 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { browserContext, context.source, ); - if (!cleanupSucceeded) { + const cleanupFailed = !cleanupSucceeded; + if (cleanupFailed) { shouldResetBrowser = true; } if (shouldResetBrowser) { - await this.closeBrowser( - `reset after Playwright render failure for ${context.source}`, - ); + const resetReason = renderFailed + ? cleanupFailed + ? `reset after Playwright render and cleanup failure for ${context.source}` + : `reset after Playwright render failure for ${context.source}` + : `reset after Playwright cleanup failure for ${context.source}`; + await this.closeBrowser(resetReason); } } diff --git a/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts b/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts index ffa6ad91..cde6c154 100644 --- a/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts +++ b/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts @@ -153,6 +153,102 @@ describe("HtmlToMarkdownMiddleware", () => { expect(context.content.match(/\| Name \| Value \|/g)).toHaveLength(6); }); + it("should preserve captions and colgroups when splitting oversized tables", async () => { + const middleware = new HtmlToMarkdownMiddleware(); + const rows = Array.from( + { length: 505 }, + (_, index) => `Row ${index + 1}Value ${index + 1}`, + ).join(""); + const html = ` + + + + + + ${rows} +
Release matrix
NameValue
+ `; + const context = createMockContext(html); + const next = vi.fn().mockResolvedValue(undefined); + const originalTurndown = TurndownService.prototype.turndown; + let convertedHtml = ""; + const turndownSpy = vi + .spyOn(TurndownService.prototype, "turndown") + .mockImplementation(function ( + this: TurndownService, + input: Parameters[0], + ) { + convertedHtml = String(input); + return originalTurndown.call(this, String(input)); + }); + + try { + await middleware.process(context, next); + + expect(next).toHaveBeenCalledOnce(); + expect(context.errors).toHaveLength(0); + expect(convertedHtml.match(/Release matrix<\/caption>/g)).toHaveLength(6); + expect(convertedHtml.match(/<\/colgroup>/g)).toHaveLength( + 6, + ); + expect(convertedHtml).toContain( + '', + ); + expect(context.content.match(/Release matrix/g)).toHaveLength(6); + expect(context.content).toContain("| Row 505 | Value 505 |"); + } finally { + turndownSpy.mockRestore(); + } + }); + + it("should preserve oversized tables as HTML when row splitting is not possible", async () => { + const middleware = new HtmlToMarkdownMiddleware(); + const headerCells = Array.from( + { length: 1001 }, + (_, index) => ``, + ).join(""); + const dataCells = Array.from( + { length: 1001 }, + (_, index) => ``, + ).join(""); + const html = ` + +
Release matrix
Header ${index + 1}Value ${index + 1}
+ ${headerCells} + ${dataCells} +
+ `; + const context = createMockContext(html); + const next = vi.fn().mockResolvedValue(undefined); + const originalTurndown = TurndownService.prototype.turndown; + let convertedHtml = ""; + const turndownSpy = vi + .spyOn(TurndownService.prototype, "turndown") + .mockImplementation(function ( + this: TurndownService, + input: Parameters[0], + ) { + convertedHtml = String(input); + return originalTurndown.call(this, String(input)); + }); + + try { + await middleware.process(context, next); + + expect(next).toHaveBeenCalledOnce(); + expect(context.errors).toHaveLength(0); + expect(convertedHtml).toContain("data-docs-mcp-preserved-table-id"); + expect(convertedHtml).not.toContain("Header 1"); + expect(context.content).toContain(""); + expect(context.content).toContain(""); + expect(context.content).toContain(""); + expect(context.content).not.toContain("data-docs-mcp-preserved-table-id"); + expect(context.content).not.toContain("| Header 1 |"); + } finally { + turndownSpy.mockRestore(); + } + }); + it("should return empty string and markdown type if conversion results in empty markdown", async () => { const middleware = new HtmlToMarkdownMiddleware(); // HTML that results in empty markdown (only comments) diff --git a/src/scraper/middleware/HtmlToMarkdownMiddleware.ts b/src/scraper/middleware/HtmlToMarkdownMiddleware.ts index ec16650b..a913fe0f 100644 --- a/src/scraper/middleware/HtmlToMarkdownMiddleware.ts +++ b/src/scraper/middleware/HtmlToMarkdownMiddleware.ts @@ -11,6 +11,7 @@ const maxGfmTableRows = 500; const maxGfmTableCells = 1_000; const maxGfmTableHtmlLength = 100_000; const gfmTableChunkRows = 100; +const preservedTablePlaceholderAttribute = "data-docs-mcp-preserved-table-id"; /** * Middleware to convert the final processed HTML content (from Cheerio object in context.dom) @@ -18,6 +19,8 @@ const gfmTableChunkRows = 100; */ export class HtmlToMarkdownMiddleware implements ContentProcessorMiddleware { private turndownService: TurndownService; + private readonly preservedTableHtml = new Map(); + private preservedTableSequence = 0; constructor() { this.turndownService = new TurndownService({ @@ -101,6 +104,21 @@ export class HtmlToMarkdownMiddleware implements ContentProcessorMiddleware { return `[${normalizedContent}](${href})`; // Standard link conversion }, }); + this.turndownService.addRule("preservedOversizedTable", { + filter: (node: Node) => { + const element = node as HTMLElement; + return ( + element.nodeName === "DIV" && + element.hasAttribute(preservedTablePlaceholderAttribute) + ); + }, + replacement: (_content: string, node: Node) => { + const element = node as HTMLElement; + const tableId = element.getAttribute(preservedTablePlaceholderAttribute); + const tableHtml = tableId ? this.preservedTableHtml.get(tableId) : undefined; + return tableHtml ? `\n\n${tableHtml}\n\n` : ""; + }, + }); } private normalizeLinkContent(content: string): string { @@ -124,6 +142,14 @@ export class HtmlToMarkdownMiddleware implements ContentProcessorMiddleware { const replacement = this.buildSplitTableHtml($, $table); if (!replacement) { + const tableId = `table-${++this.preservedTableSequence}`; + this.preservedTableHtml.set(tableId, $.html(element)); + logger.warn( + `⚠️ Preserving oversized HTML table for ${source} as HTML before GFM conversion: ${rowCount} rows, ${cellCount} cells, ${htmlLength} chars`, + ); + $table.replaceWith( + `
preserved table
`, + ); return; } @@ -151,13 +177,14 @@ export class HtmlToMarkdownMiddleware implements ContentProcessorMiddleware { const chunks: string[] = []; const headerHtml = headerRows.map((row) => $.html(row)).join(""); + const preservedChildHtml = this.getPreservedTableChildHtml($, $table); for (let index = 0; index < dataRows.length; index += gfmTableChunkRows) { const rowsHtml = dataRows .slice(index, index + gfmTableChunkRows) .map((row) => $.html(row)) .join(""); - const tableParts = ["
Header 1Value 1001
"]; + const tableParts = ["
", preservedChildHtml]; if (headerHtml) { tableParts.push("", headerHtml, ""); } @@ -168,6 +195,18 @@ export class HtmlToMarkdownMiddleware implements ContentProcessorMiddleware { return chunks.join("\n"); } + private getPreservedTableChildHtml( + $: cheerio.CheerioAPI, + $table: cheerio.Cheerio, + ): string { + return $table + .children("caption, colgroup") + .toArray() + .filter((child): child is Element => child.type === "tag") + .map((child) => $.html(child)) + .join(""); + } + private getHeaderRows($table: cheerio.Cheerio): Element[] { const theadRows = $table .children("thead") @@ -211,6 +250,7 @@ export class HtmlToMarkdownMiddleware implements ContentProcessorMiddleware { // Only process if we have a Cheerio object (implicitly means it's HTML) try { logger.debug(`Converting HTML content to Markdown for ${context.source}`); + this.preservedTableHtml.clear(); this.splitOversizedTables($, context.source); // Provide Turndown with the HTML string content from the Cheerio object's body, // or the whole document if body is empty/unavailable. @@ -240,6 +280,8 @@ export class HtmlToMarkdownMiddleware implements ContentProcessorMiddleware { ), ); // Decide if pipeline should stop? For now, continue. + } finally { + this.preservedTableHtml.clear(); } // Call the next middleware in the chain regardless of whether conversion happened