diff --git a/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts b/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts index 3df8de02..f4609619 100644 --- a/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts +++ b/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts @@ -251,6 +251,89 @@ describe("HtmlPlaywrightMiddleware", () => { launchSpy.mockRestore(); }); + it("should reset the browser after a Playwright render timeout", async () => { + const initialHtml = "

Test

"; + const context = createPipelineTestContext(initialHtml, "https://example.com/test"); + const next = vi.fn(); + + const pageSpy = createMockPlaywrightPage(initialHtml); + pageSpy.waitForSelector = vi + .fn() + .mockRejectedValue(new Error("Timeout 5000ms exceeded")); + const contextSpy = { + newPage: vi.fn().mockResolvedValue(pageSpy), + close: vi.fn().mockResolvedValue(undefined), + }; + const browserSpy = { + newContext: vi.fn().mockResolvedValue(contextSpy), + isConnected: vi.fn().mockReturnValue(true), + on: vi.fn(), + close: vi.fn().mockResolvedValue(undefined), + } as unknown as MockedObject; + const launchSpy = vi.spyOn(chromium, "launch").mockResolvedValue(browserSpy); + const closeBrowserSpy = vi.spyOn(playwrightMiddleware, "closeBrowser"); + + await playwrightMiddleware.process(context, next); + + expect(context.errors).toHaveLength(1); + expect(context.errors[0].message).toContain("Timeout 5000ms exceeded"); + expect(pageSpy.close).toHaveBeenCalled(); + expect(contextSpy.close).toHaveBeenCalled(); + expect(browserSpy.close).toHaveBeenCalled(); + expect(closeBrowserSpy).toHaveBeenCalledWith( + "reset after Playwright render failure for https://example.com/test", + ); + expect(next).toHaveBeenCalled(); + + closeBrowserSpy.mockRestore(); + launchSpy.mockRestore(); + }); + + it("should bound hung page cleanup and reset the browser", async () => { + vi.useFakeTimers(); + try { + const initialHtml = "

Test

"; + const context = createPipelineTestContext( + initialHtml, + "https://example.com/test", + ); + const next = vi.fn(); + + const pageSpy = createMockPlaywrightPage(initialHtml); + pageSpy.close = vi.fn().mockImplementation(() => new Promise(() => {})); + const contextSpy = { + newPage: vi.fn().mockResolvedValue(pageSpy), + close: vi.fn().mockResolvedValue(undefined), + }; + const browserSpy = { + newContext: vi.fn().mockResolvedValue(contextSpy), + isConnected: vi.fn().mockReturnValue(true), + on: vi.fn(), + close: vi.fn().mockResolvedValue(undefined), + } as unknown as MockedObject; + const launchSpy = vi.spyOn(chromium, "launch").mockResolvedValue(browserSpy); + const closeBrowserSpy = vi.spyOn(playwrightMiddleware, "closeBrowser"); + + const processPromise = playwrightMiddleware.process(context, next); + await vi.advanceTimersByTimeAsync(5000); + await expect(processPromise).resolves.toBeUndefined(); + + expect(context.errors).toHaveLength(0); + expect(pageSpy.close).toHaveBeenCalled(); + expect(contextSpy.close).toHaveBeenCalled(); + expect(browserSpy.close).toHaveBeenCalled(); + expect(closeBrowserSpy).toHaveBeenCalledWith( + "reset after Playwright cleanup failure for https://example.com/test", + ); + expect(next).toHaveBeenCalled(); + + closeBrowserSpy.mockRestore(); + launchSpy.mockRestore(); + } finally { + vi.useRealTimers(); + } + }); + it("should skip processing when scrapeMode is not playwright/auto", async () => { const initialHtml = "

Test

"; const context = createPipelineTestContext(initialHtml, "https://example.com/test", { diff --git a/src/scraper/middleware/HtmlPlaywrightMiddleware.ts b/src/scraper/middleware/HtmlPlaywrightMiddleware.ts index 4b35fed0..a241e7bc 100644 --- a/src/scraper/middleware/HtmlPlaywrightMiddleware.ts +++ b/src/scraper/middleware/HtmlPlaywrightMiddleware.ts @@ -40,6 +40,16 @@ interface CachedResource { contentType: string; } +interface BrowserProcessHandle { + killed: boolean; + pid?: number; + kill(signal?: NodeJS.Signals | number): boolean; +} + +interface BrowserWithProcessHandle extends Browser { + process(): BrowserProcessHandle | null; +} + /** * Middleware to process HTML content using Playwright for rendering dynamic content, * *if* the scrapeMode option requires it ('playwright' or 'auto'). @@ -116,20 +126,80 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { return this.browser; } + private getPageTimeoutMs(): number { + return this.config.pageTimeoutMs ?? defaults.scraper.pageTimeoutMs; + } + + private getBrowserTimeoutMs(): number { + return this.config.browserTimeoutMs ?? defaults.scraper.browserTimeoutMs; + } + + private async withTimeout( + operation: Promise, + timeoutMs: number, + description: string, + ): Promise { + let timeout: NodeJS.Timeout | undefined; + try { + return await Promise.race([ + operation, + new Promise((_resolve, reject) => { + timeout = setTimeout(() => { + reject(new Error(`${description} timed out after ${timeoutMs}ms`)); + }, timeoutMs); + }), + ]); + } finally { + if (timeout) { + clearTimeout(timeout); + } + } + } + + private getBrowserProcess(browser: Browser): BrowserProcessHandle | null { + if (!("process" in browser) || typeof browser.process !== "function") { + return null; + } + + return (browser as BrowserWithProcessHandle).process(); + } + + private killBrowserProcess(browser: Browser, reason: string): void { + const browserProcess = this.getBrowserProcess(browser); + if (!browserProcess || browserProcess.killed) { + return; + } + + const pid = browserProcess.pid ? ` pid=${browserProcess.pid}` : ""; + logger.warn(`⚠️ Force killing Playwright browser process${pid}: ${reason}`); + try { + if (!browserProcess.kill("SIGKILL")) { + logger.warn(`⚠️ Failed to force kill Playwright browser process${pid}`); + } + } catch (error) { + logger.warn(`⚠️ Failed to force kill Playwright browser process${pid}: ${error}`); + } + } + /** * Closes the Playwright browser instance if it exists. * Should be called during application shutdown. * Attempts to close even if the browser is disconnected to ensure proper cleanup of zombie processes. */ - async closeBrowser(): Promise { + async closeBrowser(reason = "shutdown"): Promise { if (this.browser) { + const browser = this.browser; try { - logger.debug("Closing Playwright browser instance..."); + logger.debug(`Closing Playwright browser instance (${reason})...`); // Always attempt to close, even if disconnected, to reap zombie processes - await this.browser.close(); + await this.withTimeout( + browser.close(), + this.getPageTimeoutMs(), + "Playwright browser close", + ); } catch (error) { - // Log error but don't throw - cleanup should be non-fatal - logger.warn(`⚠️ Error closing Playwright browser: ${error}`); + logger.warn(`⚠️ Error closing Playwright browser (${reason}): ${error}`); + this.killBrowserProcess(browser, reason); } finally { // Always set to null to allow fresh browser on next request this.browser = null; @@ -137,6 +207,54 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { } } + private async cleanupPageAndContext( + page: Page | null, + browserContext: BrowserContext | null, + source: string, + ): Promise { + let cleanupSucceeded = true; + const timeoutMs = this.getPageTimeoutMs(); + + if (page) { + try { + await this.withTimeout( + page.unroute("**/*"), + timeoutMs, + `Playwright route cleanup for ${source}`, + ); + } catch (error) { + cleanupSucceeded = false; + logger.warn(`⚠️ Playwright route cleanup failed for ${source}: ${error}`); + } + + try { + await this.withTimeout( + page.close(), + timeoutMs, + `Playwright page close for ${source}`, + ); + } catch (error) { + cleanupSucceeded = false; + logger.warn(`⚠️ Playwright page cleanup failed for ${source}: ${error}`); + } + } + + if (browserContext) { + try { + await this.withTimeout( + browserContext.close(), + timeoutMs, + `Playwright browser context close for ${source}`, + ); + } catch (error) { + cleanupSucceeded = false; + logger.warn(`⚠️ Playwright context cleanup failed for ${source}: ${error}`); + } + } + + return cleanupSucceeded; + } + /** * Injects the shadow DOM extractor script into the page. * This script performs non-invasive extraction that preserves document structure. @@ -979,6 +1097,8 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { let page: Page | null = null; let browserContext: BrowserContext | null = null; let renderedHtml: string | null = null; + let shouldResetBrowser = false; + let renderFailed = false; // Extract credentials and origin using helper const { credentials, origin } = extractCredentialsAndOrigin(context.source); @@ -1052,11 +1172,15 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { ); }); + // Wait for either body (normal HTML) or frameset (frameset documents) to appear + const pageTimeoutMs = this.getPageTimeoutMs(); + // Load initial HTML content - await page.goto(context.source, { waitUntil: "load" }); + await page.goto(context.source, { + waitUntil: "load", + timeout: this.getBrowserTimeoutMs(), + }); - // Wait for either body (normal HTML) or frameset (frameset documents) to appear - const pageTimeoutMs = this.config.pageTimeoutMs ?? defaults.scraper.pageTimeoutMs; await page.waitForSelector("body, frameset", { timeout: pageTimeoutMs, }); @@ -1081,6 +1205,8 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { `Playwright: Successfully rendered content for ${context.source} using ${method}`, ); } catch (error) { + renderFailed = true; + shouldResetBrowser = true; logger.error(`❌ Playwright failed to render ${context.source}: ${error}`); context.errors.push( error instanceof Error @@ -1088,13 +1214,22 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { : new Error(`Playwright rendering failed: ${String(error)}`), ); } finally { - // Ensure page/context are closed even if subsequent steps fail - if (page) { - await page.unroute("**/*"); - await page.close(); + const cleanupSucceeded = await this.cleanupPageAndContext( + page, + browserContext, + context.source, + ); + const cleanupFailed = !cleanupSucceeded; + if (cleanupFailed) { + shouldResetBrowser = true; } - if (browserContext) { - await browserContext.close(); + if (shouldResetBrowser) { + const resetReason = renderFailed + ? cleanupFailed + ? `reset after Playwright render and cleanup failure for ${context.source}` + : `reset after Playwright render failure for ${context.source}` + : `reset after Playwright cleanup failure for ${context.source}`; + await this.closeBrowser(resetReason); } } diff --git a/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts b/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts index 176e4948..cde6c154 100644 --- a/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts +++ b/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts @@ -127,6 +127,128 @@ describe("HtmlToMarkdownMiddleware", () => { // No close needed }); + it("should split oversized tables before GFM conversion while retaining content", async () => { + const middleware = new HtmlToMarkdownMiddleware(); + const rows = Array.from( + { length: 505 }, + (_, index) => + `Row ${index + 1}Value ${index + 1}`, + ).join(""); + const html = ` + + + + ${rows} +
NameValue
+ `; + const context = createMockContext(html); + const next = vi.fn().mockResolvedValue(undefined); + + await middleware.process(context, next); + + expect(next).toHaveBeenCalledOnce(); + expect(context.errors).toHaveLength(0); + expect(context.content).toContain("| Row 1 | **Value 1** |"); + expect(context.content).toContain("| Row 505 | **Value 505** |"); + expect(context.content.match(/\| Name \| Value \|/g)).toHaveLength(6); + }); + + it("should preserve captions and colgroups when splitting oversized tables", async () => { + const middleware = new HtmlToMarkdownMiddleware(); + const rows = Array.from( + { length: 505 }, + (_, index) => `Row ${index + 1}Value ${index + 1}`, + ).join(""); + const html = ` + + + + + + ${rows} +
Release matrix
NameValue
+ `; + const context = createMockContext(html); + const next = vi.fn().mockResolvedValue(undefined); + const originalTurndown = TurndownService.prototype.turndown; + let convertedHtml = ""; + const turndownSpy = vi + .spyOn(TurndownService.prototype, "turndown") + .mockImplementation(function ( + this: TurndownService, + input: Parameters[0], + ) { + convertedHtml = String(input); + return originalTurndown.call(this, String(input)); + }); + + try { + await middleware.process(context, next); + + expect(next).toHaveBeenCalledOnce(); + expect(context.errors).toHaveLength(0); + expect(convertedHtml.match(/Release matrix<\/caption>/g)).toHaveLength(6); + expect(convertedHtml.match(/<\/colgroup>/g)).toHaveLength( + 6, + ); + expect(convertedHtml).toContain( + '', + ); + expect(context.content.match(/Release matrix/g)).toHaveLength(6); + expect(context.content).toContain("| Row 505 | Value 505 |"); + } finally { + turndownSpy.mockRestore(); + } + }); + + it("should preserve oversized tables as HTML when row splitting is not possible", async () => { + const middleware = new HtmlToMarkdownMiddleware(); + const headerCells = Array.from( + { length: 1001 }, + (_, index) => ``, + ).join(""); + const dataCells = Array.from( + { length: 1001 }, + (_, index) => ``, + ).join(""); + const html = ` + +
Release matrix
Header ${index + 1}Value ${index + 1}
+ ${headerCells} + ${dataCells} +
+ `; + const context = createMockContext(html); + const next = vi.fn().mockResolvedValue(undefined); + const originalTurndown = TurndownService.prototype.turndown; + let convertedHtml = ""; + const turndownSpy = vi + .spyOn(TurndownService.prototype, "turndown") + .mockImplementation(function ( + this: TurndownService, + input: Parameters[0], + ) { + convertedHtml = String(input); + return originalTurndown.call(this, String(input)); + }); + + try { + await middleware.process(context, next); + + expect(next).toHaveBeenCalledOnce(); + expect(context.errors).toHaveLength(0); + expect(convertedHtml).toContain("data-docs-mcp-preserved-table-id"); + expect(convertedHtml).not.toContain("Header 1"); + expect(context.content).toContain(""); + expect(context.content).toContain(""); + expect(context.content).toContain(""); + expect(context.content).not.toContain("data-docs-mcp-preserved-table-id"); + expect(context.content).not.toContain("| Header 1 |"); + } finally { + turndownSpy.mockRestore(); + } + }); + it("should return empty string and markdown type if conversion results in empty markdown", async () => { const middleware = new HtmlToMarkdownMiddleware(); // HTML that results in empty markdown (only comments) diff --git a/src/scraper/middleware/HtmlToMarkdownMiddleware.ts b/src/scraper/middleware/HtmlToMarkdownMiddleware.ts index cbcf76d3..a913fe0f 100644 --- a/src/scraper/middleware/HtmlToMarkdownMiddleware.ts +++ b/src/scraper/middleware/HtmlToMarkdownMiddleware.ts @@ -1,16 +1,26 @@ // @ts-expect-error import { gfm } from "@joplin/turndown-plugin-gfm"; +import type * as cheerio from "cheerio"; +import type { Element } from "domhandler"; import TurndownService from "turndown"; import { logger } from "../../utils/logger"; import { fullTrim } from "../../utils/string"; import type { ContentProcessorMiddleware, MiddlewareContext } from "./types"; +const maxGfmTableRows = 500; +const maxGfmTableCells = 1_000; +const maxGfmTableHtmlLength = 100_000; +const gfmTableChunkRows = 100; +const preservedTablePlaceholderAttribute = "data-docs-mcp-preserved-table-id"; + /** * Middleware to convert the final processed HTML content (from Cheerio object in context.dom) * into Markdown using Turndown, applying custom rules. */ export class HtmlToMarkdownMiddleware implements ContentProcessorMiddleware { private turndownService: TurndownService; + private readonly preservedTableHtml = new Map(); + private preservedTableSequence = 0; constructor() { this.turndownService = new TurndownService({ @@ -94,12 +104,133 @@ export class HtmlToMarkdownMiddleware implements ContentProcessorMiddleware { return `[${normalizedContent}](${href})`; // Standard link conversion }, }); + this.turndownService.addRule("preservedOversizedTable", { + filter: (node: Node) => { + const element = node as HTMLElement; + return ( + element.nodeName === "DIV" && + element.hasAttribute(preservedTablePlaceholderAttribute) + ); + }, + replacement: (_content: string, node: Node) => { + const element = node as HTMLElement; + const tableId = element.getAttribute(preservedTablePlaceholderAttribute); + const tableHtml = tableId ? this.preservedTableHtml.get(tableId) : undefined; + return tableHtml ? `\n\n${tableHtml}\n\n` : ""; + }, + }); } private normalizeLinkContent(content: string): string { return fullTrim(content).replace(/[ \t]*[\r\n]+[ \t]*/g, " "); } + private splitOversizedTables($: cheerio.CheerioAPI, source: string): void { + $("table").each((_index, element) => { + const $table = $(element); + const rowCount = $table.find("tr").length; + const cellCount = $table.find("td, th").length; + const htmlLength = $.html(element).length; + + if ( + rowCount <= maxGfmTableRows && + cellCount <= maxGfmTableCells && + htmlLength <= maxGfmTableHtmlLength + ) { + return; + } + + const replacement = this.buildSplitTableHtml($, $table); + if (!replacement) { + const tableId = `table-${++this.preservedTableSequence}`; + this.preservedTableHtml.set(tableId, $.html(element)); + logger.warn( + `⚠️ Preserving oversized HTML table for ${source} as HTML before GFM conversion: ${rowCount} rows, ${cellCount} cells, ${htmlLength} chars`, + ); + $table.replaceWith( + `
preserved table
`, + ); + return; + } + + logger.warn( + `⚠️ Splitting oversized HTML table for ${source} before GFM conversion: ${rowCount} rows, ${cellCount} cells, ${htmlLength} chars`, + ); + $table.replaceWith(replacement); + }); + } + + private buildSplitTableHtml( + $: cheerio.CheerioAPI, + $table: cheerio.Cheerio, + ): string | null { + const headerRows = this.getHeaderRows($table); + const headerRowSet = new Set(headerRows); + const dataRows = $table + .find("tr") + .toArray() + .filter((row): row is Element => row.type === "tag" && !headerRowSet.has(row)); + + if (dataRows.length <= gfmTableChunkRows) { + return null; + } + + const chunks: string[] = []; + const headerHtml = headerRows.map((row) => $.html(row)).join(""); + const preservedChildHtml = this.getPreservedTableChildHtml($, $table); + + for (let index = 0; index < dataRows.length; index += gfmTableChunkRows) { + const rowsHtml = dataRows + .slice(index, index + gfmTableChunkRows) + .map((row) => $.html(row)) + .join(""); + const tableParts = ["
Header 1Value 1001
", preservedChildHtml]; + if (headerHtml) { + tableParts.push("", headerHtml, ""); + } + tableParts.push("", rowsHtml, "
"); + chunks.push(tableParts.join("")); + } + + return chunks.join("\n"); + } + + private getPreservedTableChildHtml( + $: cheerio.CheerioAPI, + $table: cheerio.Cheerio, + ): string { + return $table + .children("caption, colgroup") + .toArray() + .filter((child): child is Element => child.type === "tag") + .map((child) => $.html(child)) + .join(""); + } + + private getHeaderRows($table: cheerio.Cheerio): Element[] { + const theadRows = $table + .children("thead") + .find("tr") + .toArray() + .filter((row): row is Element => row.type === "tag"); + + if (theadRows.length > 0) { + return theadRows; + } + + const firstRow = $table.find("tr").first(); + if (firstRow.length === 0) { + return []; + } + + const cells = firstRow.children("th, td").toArray(); + const isHeaderRow = + cells.length > 0 && + cells.every((cell) => cell.type === "tag" && cell.tagName === "th"); + + return isHeaderRow ? (firstRow.toArray() as Element[]) : []; + } + /** * Processes the context to convert the sanitized HTML body node to Markdown. * @param context The current processing context. @@ -119,6 +250,8 @@ export class HtmlToMarkdownMiddleware implements ContentProcessorMiddleware { // Only process if we have a Cheerio object (implicitly means it's HTML) try { logger.debug(`Converting HTML content to Markdown for ${context.source}`); + this.preservedTableHtml.clear(); + this.splitOversizedTables($, context.source); // Provide Turndown with the HTML string content from the Cheerio object's body, // or the whole document if body is empty/unavailable. const htmlToConvert = $("body").html() || $.html(); @@ -147,6 +280,8 @@ export class HtmlToMarkdownMiddleware implements ContentProcessorMiddleware { ), ); // Decide if pipeline should stop? For now, continue. + } finally { + this.preservedTableHtml.clear(); } // Call the next middleware in the chain regardless of whether conversion happened