diff --git a/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts b/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts
index 3df8de02..f4609619 100644
--- a/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts
+++ b/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts
@@ -251,6 +251,89 @@ describe("HtmlPlaywrightMiddleware", () => {
launchSpy.mockRestore();
});
+ it("should reset the browser after a Playwright render timeout", async () => {
+ const initialHtml = "
Test
";
+ const context = createPipelineTestContext(initialHtml, "https://example.com/test");
+ const next = vi.fn();
+
+ const pageSpy = createMockPlaywrightPage(initialHtml);
+ pageSpy.waitForSelector = vi
+ .fn()
+ .mockRejectedValue(new Error("Timeout 5000ms exceeded"));
+ const contextSpy = {
+ newPage: vi.fn().mockResolvedValue(pageSpy),
+ close: vi.fn().mockResolvedValue(undefined),
+ };
+ const browserSpy = {
+ newContext: vi.fn().mockResolvedValue(contextSpy),
+ isConnected: vi.fn().mockReturnValue(true),
+ on: vi.fn(),
+ close: vi.fn().mockResolvedValue(undefined),
+ } as unknown as MockedObject;
+ const launchSpy = vi.spyOn(chromium, "launch").mockResolvedValue(browserSpy);
+ const closeBrowserSpy = vi.spyOn(playwrightMiddleware, "closeBrowser");
+
+ await playwrightMiddleware.process(context, next);
+
+ expect(context.errors).toHaveLength(1);
+ expect(context.errors[0].message).toContain("Timeout 5000ms exceeded");
+ expect(pageSpy.close).toHaveBeenCalled();
+ expect(contextSpy.close).toHaveBeenCalled();
+ expect(browserSpy.close).toHaveBeenCalled();
+ expect(closeBrowserSpy).toHaveBeenCalledWith(
+ "reset after Playwright render failure for https://example.com/test",
+ );
+ expect(next).toHaveBeenCalled();
+
+ closeBrowserSpy.mockRestore();
+ launchSpy.mockRestore();
+ });
+
+ it("should bound hung page cleanup and reset the browser", async () => {
+ vi.useFakeTimers();
+ try {
+ const initialHtml = "Test
";
+ const context = createPipelineTestContext(
+ initialHtml,
+ "https://example.com/test",
+ );
+ const next = vi.fn();
+
+ const pageSpy = createMockPlaywrightPage(initialHtml);
+ pageSpy.close = vi.fn().mockImplementation(() => new Promise(() => {}));
+ const contextSpy = {
+ newPage: vi.fn().mockResolvedValue(pageSpy),
+ close: vi.fn().mockResolvedValue(undefined),
+ };
+ const browserSpy = {
+ newContext: vi.fn().mockResolvedValue(contextSpy),
+ isConnected: vi.fn().mockReturnValue(true),
+ on: vi.fn(),
+ close: vi.fn().mockResolvedValue(undefined),
+ } as unknown as MockedObject;
+ const launchSpy = vi.spyOn(chromium, "launch").mockResolvedValue(browserSpy);
+ const closeBrowserSpy = vi.spyOn(playwrightMiddleware, "closeBrowser");
+
+ const processPromise = playwrightMiddleware.process(context, next);
+ await vi.advanceTimersByTimeAsync(5000);
+ await expect(processPromise).resolves.toBeUndefined();
+
+ expect(context.errors).toHaveLength(0);
+ expect(pageSpy.close).toHaveBeenCalled();
+ expect(contextSpy.close).toHaveBeenCalled();
+ expect(browserSpy.close).toHaveBeenCalled();
+ expect(closeBrowserSpy).toHaveBeenCalledWith(
+ "reset after Playwright cleanup failure for https://example.com/test",
+ );
+ expect(next).toHaveBeenCalled();
+
+ closeBrowserSpy.mockRestore();
+ launchSpy.mockRestore();
+ } finally {
+ vi.useRealTimers();
+ }
+ });
+
it("should skip processing when scrapeMode is not playwright/auto", async () => {
const initialHtml = "Test
";
const context = createPipelineTestContext(initialHtml, "https://example.com/test", {
diff --git a/src/scraper/middleware/HtmlPlaywrightMiddleware.ts b/src/scraper/middleware/HtmlPlaywrightMiddleware.ts
index 4b35fed0..a241e7bc 100644
--- a/src/scraper/middleware/HtmlPlaywrightMiddleware.ts
+++ b/src/scraper/middleware/HtmlPlaywrightMiddleware.ts
@@ -40,6 +40,16 @@ interface CachedResource {
contentType: string;
}
+interface BrowserProcessHandle {
+ killed: boolean;
+ pid?: number;
+ kill(signal?: NodeJS.Signals | number): boolean;
+}
+
+interface BrowserWithProcessHandle extends Browser {
+ process(): BrowserProcessHandle | null;
+}
+
/**
* Middleware to process HTML content using Playwright for rendering dynamic content,
* *if* the scrapeMode option requires it ('playwright' or 'auto').
@@ -116,20 +126,80 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware {
return this.browser;
}
+ private getPageTimeoutMs(): number {
+ return this.config.pageTimeoutMs ?? defaults.scraper.pageTimeoutMs;
+ }
+
+ private getBrowserTimeoutMs(): number {
+ return this.config.browserTimeoutMs ?? defaults.scraper.browserTimeoutMs;
+ }
+
+ private async withTimeout(
+ operation: Promise,
+ timeoutMs: number,
+ description: string,
+ ): Promise {
+ let timeout: NodeJS.Timeout | undefined;
+ try {
+ return await Promise.race([
+ operation,
+ new Promise((_resolve, reject) => {
+ timeout = setTimeout(() => {
+ reject(new Error(`${description} timed out after ${timeoutMs}ms`));
+ }, timeoutMs);
+ }),
+ ]);
+ } finally {
+ if (timeout) {
+ clearTimeout(timeout);
+ }
+ }
+ }
+
+ private getBrowserProcess(browser: Browser): BrowserProcessHandle | null {
+ if (!("process" in browser) || typeof browser.process !== "function") {
+ return null;
+ }
+
+ return (browser as BrowserWithProcessHandle).process();
+ }
+
+ private killBrowserProcess(browser: Browser, reason: string): void {
+ const browserProcess = this.getBrowserProcess(browser);
+ if (!browserProcess || browserProcess.killed) {
+ return;
+ }
+
+ const pid = browserProcess.pid ? ` pid=${browserProcess.pid}` : "";
+ logger.warn(`⚠️ Force killing Playwright browser process${pid}: ${reason}`);
+ try {
+ if (!browserProcess.kill("SIGKILL")) {
+ logger.warn(`⚠️ Failed to force kill Playwright browser process${pid}`);
+ }
+ } catch (error) {
+ logger.warn(`⚠️ Failed to force kill Playwright browser process${pid}: ${error}`);
+ }
+ }
+
/**
* Closes the Playwright browser instance if it exists.
* Should be called during application shutdown.
* Attempts to close even if the browser is disconnected to ensure proper cleanup of zombie processes.
*/
- async closeBrowser(): Promise {
+ async closeBrowser(reason = "shutdown"): Promise {
if (this.browser) {
+ const browser = this.browser;
try {
- logger.debug("Closing Playwright browser instance...");
+ logger.debug(`Closing Playwright browser instance (${reason})...`);
// Always attempt to close, even if disconnected, to reap zombie processes
- await this.browser.close();
+ await this.withTimeout(
+ browser.close(),
+ this.getPageTimeoutMs(),
+ "Playwright browser close",
+ );
} catch (error) {
- // Log error but don't throw - cleanup should be non-fatal
- logger.warn(`⚠️ Error closing Playwright browser: ${error}`);
+ logger.warn(`⚠️ Error closing Playwright browser (${reason}): ${error}`);
+ this.killBrowserProcess(browser, reason);
} finally {
// Always set to null to allow fresh browser on next request
this.browser = null;
@@ -137,6 +207,54 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware {
}
}
+ private async cleanupPageAndContext(
+ page: Page | null,
+ browserContext: BrowserContext | null,
+ source: string,
+ ): Promise {
+ let cleanupSucceeded = true;
+ const timeoutMs = this.getPageTimeoutMs();
+
+ if (page) {
+ try {
+ await this.withTimeout(
+ page.unroute("**/*"),
+ timeoutMs,
+ `Playwright route cleanup for ${source}`,
+ );
+ } catch (error) {
+ cleanupSucceeded = false;
+ logger.warn(`⚠️ Playwright route cleanup failed for ${source}: ${error}`);
+ }
+
+ try {
+ await this.withTimeout(
+ page.close(),
+ timeoutMs,
+ `Playwright page close for ${source}`,
+ );
+ } catch (error) {
+ cleanupSucceeded = false;
+ logger.warn(`⚠️ Playwright page cleanup failed for ${source}: ${error}`);
+ }
+ }
+
+ if (browserContext) {
+ try {
+ await this.withTimeout(
+ browserContext.close(),
+ timeoutMs,
+ `Playwright browser context close for ${source}`,
+ );
+ } catch (error) {
+ cleanupSucceeded = false;
+ logger.warn(`⚠️ Playwright context cleanup failed for ${source}: ${error}`);
+ }
+ }
+
+ return cleanupSucceeded;
+ }
+
/**
* Injects the shadow DOM extractor script into the page.
* This script performs non-invasive extraction that preserves document structure.
@@ -979,6 +1097,8 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware {
let page: Page | null = null;
let browserContext: BrowserContext | null = null;
let renderedHtml: string | null = null;
+ let shouldResetBrowser = false;
+ let renderFailed = false;
// Extract credentials and origin using helper
const { credentials, origin } = extractCredentialsAndOrigin(context.source);
@@ -1052,11 +1172,15 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware {
);
});
+ // Wait for either body (normal HTML) or frameset (frameset documents) to appear
+ const pageTimeoutMs = this.getPageTimeoutMs();
+
// Load initial HTML content
- await page.goto(context.source, { waitUntil: "load" });
+ await page.goto(context.source, {
+ waitUntil: "load",
+ timeout: this.getBrowserTimeoutMs(),
+ });
- // Wait for either body (normal HTML) or frameset (frameset documents) to appear
- const pageTimeoutMs = this.config.pageTimeoutMs ?? defaults.scraper.pageTimeoutMs;
await page.waitForSelector("body, frameset", {
timeout: pageTimeoutMs,
});
@@ -1081,6 +1205,8 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware {
`Playwright: Successfully rendered content for ${context.source} using ${method}`,
);
} catch (error) {
+ renderFailed = true;
+ shouldResetBrowser = true;
logger.error(`❌ Playwright failed to render ${context.source}: ${error}`);
context.errors.push(
error instanceof Error
@@ -1088,13 +1214,22 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware {
: new Error(`Playwright rendering failed: ${String(error)}`),
);
} finally {
- // Ensure page/context are closed even if subsequent steps fail
- if (page) {
- await page.unroute("**/*");
- await page.close();
+ const cleanupSucceeded = await this.cleanupPageAndContext(
+ page,
+ browserContext,
+ context.source,
+ );
+ const cleanupFailed = !cleanupSucceeded;
+ if (cleanupFailed) {
+ shouldResetBrowser = true;
}
- if (browserContext) {
- await browserContext.close();
+ if (shouldResetBrowser) {
+ const resetReason = renderFailed
+ ? cleanupFailed
+ ? `reset after Playwright render and cleanup failure for ${context.source}`
+ : `reset after Playwright render failure for ${context.source}`
+ : `reset after Playwright cleanup failure for ${context.source}`;
+ await this.closeBrowser(resetReason);
}
}
diff --git a/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts b/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts
index 176e4948..cde6c154 100644
--- a/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts
+++ b/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts
@@ -127,6 +127,128 @@ describe("HtmlToMarkdownMiddleware", () => {
// No close needed
});
+ it("should split oversized tables before GFM conversion while retaining content", async () => {
+ const middleware = new HtmlToMarkdownMiddleware();
+ const rows = Array.from(
+ { length: 505 },
+ (_, index) =>
+ `| Row ${index + 1} | Value ${index + 1} |
`,
+ ).join("");
+ const html = `
+
+
+ `;
+ const context = createMockContext(html);
+ const next = vi.fn().mockResolvedValue(undefined);
+
+ await middleware.process(context, next);
+
+ expect(next).toHaveBeenCalledOnce();
+ expect(context.errors).toHaveLength(0);
+ expect(context.content).toContain("| Row 1 | **Value 1** |");
+ expect(context.content).toContain("| Row 505 | **Value 505** |");
+ expect(context.content.match(/\| Name \| Value \|/g)).toHaveLength(6);
+ });
+
+ it("should preserve captions and colgroups when splitting oversized tables", async () => {
+ const middleware = new HtmlToMarkdownMiddleware();
+ const rows = Array.from(
+ { length: 505 },
+ (_, index) => `| Row ${index + 1} | Value ${index + 1} |
`,
+ ).join("");
+ const html = `
+
+
+ Release matrix
+
+ | Name | Value |
+ ${rows}
+
+ `;
+ const context = createMockContext(html);
+ const next = vi.fn().mockResolvedValue(undefined);
+ const originalTurndown = TurndownService.prototype.turndown;
+ let convertedHtml = "";
+ const turndownSpy = vi
+ .spyOn(TurndownService.prototype, "turndown")
+ .mockImplementation(function (
+ this: TurndownService,
+ input: Parameters[0],
+ ) {
+ convertedHtml = String(input);
+ return originalTurndown.call(this, String(input));
+ });
+
+ try {
+ await middleware.process(context, next);
+
+ expect(next).toHaveBeenCalledOnce();
+ expect(context.errors).toHaveLength(0);
+ expect(convertedHtml.match(/Release matrix<\/caption>/g)).toHaveLength(6);
+ expect(convertedHtml.match(/<\/colgroup>/g)).toHaveLength(
+ 6,
+ );
+ expect(convertedHtml).toContain(
+ 'Release matrix',
+ );
+ expect(context.content.match(/Release matrix/g)).toHaveLength(6);
+ expect(context.content).toContain("| Row 505 | Value 505 |");
+ } finally {
+ turndownSpy.mockRestore();
+ }
+ });
+
+ it("should preserve oversized tables as HTML when row splitting is not possible", async () => {
+ const middleware = new HtmlToMarkdownMiddleware();
+ const headerCells = Array.from(
+ { length: 1001 },
+ (_, index) => `| Header ${index + 1} | `,
+ ).join("");
+ const dataCells = Array.from(
+ { length: 1001 },
+ (_, index) => `Value ${index + 1} | `,
+ ).join("");
+ const html = `
+
+
+ ${headerCells}
+ ${dataCells}
+
+ `;
+ const context = createMockContext(html);
+ const next = vi.fn().mockResolvedValue(undefined);
+ const originalTurndown = TurndownService.prototype.turndown;
+ let convertedHtml = "";
+ const turndownSpy = vi
+ .spyOn(TurndownService.prototype, "turndown")
+ .mockImplementation(function (
+ this: TurndownService,
+ input: Parameters[0],
+ ) {
+ convertedHtml = String(input);
+ return originalTurndown.call(this, String(input));
+ });
+
+ try {
+ await middleware.process(context, next);
+
+ expect(next).toHaveBeenCalledOnce();
+ expect(context.errors).toHaveLength(0);
+ expect(convertedHtml).toContain("data-docs-mcp-preserved-table-id");
+ expect(convertedHtml).not.toContain("Header 1 | ");
+ expect(context.content).toContain("");
+ expect(context.content).toContain("| Header 1 | ");
+ expect(context.content).toContain("Value 1001 | ");
+ expect(context.content).not.toContain("data-docs-mcp-preserved-table-id");
+ expect(context.content).not.toContain("| Header 1 |");
+ } finally {
+ turndownSpy.mockRestore();
+ }
+ });
+
it("should return empty string and markdown type if conversion results in empty markdown", async () => {
const middleware = new HtmlToMarkdownMiddleware();
// HTML that results in empty markdown (only comments)
diff --git a/src/scraper/middleware/HtmlToMarkdownMiddleware.ts b/src/scraper/middleware/HtmlToMarkdownMiddleware.ts
index cbcf76d3..a913fe0f 100644
--- a/src/scraper/middleware/HtmlToMarkdownMiddleware.ts
+++ b/src/scraper/middleware/HtmlToMarkdownMiddleware.ts
@@ -1,16 +1,26 @@
// @ts-expect-error
import { gfm } from "@joplin/turndown-plugin-gfm";
+import type * as cheerio from "cheerio";
+import type { Element } from "domhandler";
import TurndownService from "turndown";
import { logger } from "../../utils/logger";
import { fullTrim } from "../../utils/string";
import type { ContentProcessorMiddleware, MiddlewareContext } from "./types";
+const maxGfmTableRows = 500;
+const maxGfmTableCells = 1_000;
+const maxGfmTableHtmlLength = 100_000;
+const gfmTableChunkRows = 100;
+const preservedTablePlaceholderAttribute = "data-docs-mcp-preserved-table-id";
+
/**
* Middleware to convert the final processed HTML content (from Cheerio object in context.dom)
* into Markdown using Turndown, applying custom rules.
*/
export class HtmlToMarkdownMiddleware implements ContentProcessorMiddleware {
private turndownService: TurndownService;
+ private readonly preservedTableHtml = new Map();
+ private preservedTableSequence = 0;
constructor() {
this.turndownService = new TurndownService({
@@ -94,12 +104,133 @@ export class HtmlToMarkdownMiddleware implements ContentProcessorMiddleware {
return `[${normalizedContent}](${href})`; // Standard link conversion
},
});
+ this.turndownService.addRule("preservedOversizedTable", {
+ filter: (node: Node) => {
+ const element = node as HTMLElement;
+ return (
+ element.nodeName === "DIV" &&
+ element.hasAttribute(preservedTablePlaceholderAttribute)
+ );
+ },
+ replacement: (_content: string, node: Node) => {
+ const element = node as HTMLElement;
+ const tableId = element.getAttribute(preservedTablePlaceholderAttribute);
+ const tableHtml = tableId ? this.preservedTableHtml.get(tableId) : undefined;
+ return tableHtml ? `\n\n${tableHtml}\n\n` : "";
+ },
+ });
}
private normalizeLinkContent(content: string): string {
return fullTrim(content).replace(/[ \t]*[\r\n]+[ \t]*/g, " ");
}
+ private splitOversizedTables($: cheerio.CheerioAPI, source: string): void {
+ $("table").each((_index, element) => {
+ const $table = $(element);
+ const rowCount = $table.find("tr").length;
+ const cellCount = $table.find("td, th").length;
+ const htmlLength = $.html(element).length;
+
+ if (
+ rowCount <= maxGfmTableRows &&
+ cellCount <= maxGfmTableCells &&
+ htmlLength <= maxGfmTableHtmlLength
+ ) {
+ return;
+ }
+
+ const replacement = this.buildSplitTableHtml($, $table);
+ if (!replacement) {
+ const tableId = `table-${++this.preservedTableSequence}`;
+ this.preservedTableHtml.set(tableId, $.html(element));
+ logger.warn(
+ `⚠️ Preserving oversized HTML table for ${source} as HTML before GFM conversion: ${rowCount} rows, ${cellCount} cells, ${htmlLength} chars`,
+ );
+ $table.replaceWith(
+ `preserved table
`,
+ );
+ return;
+ }
+
+ logger.warn(
+ `⚠️ Splitting oversized HTML table for ${source} before GFM conversion: ${rowCount} rows, ${cellCount} cells, ${htmlLength} chars`,
+ );
+ $table.replaceWith(replacement);
+ });
+ }
+
+ private buildSplitTableHtml(
+ $: cheerio.CheerioAPI,
+ $table: cheerio.Cheerio,
+ ): string | null {
+ const headerRows = this.getHeaderRows($table);
+ const headerRowSet = new Set(headerRows);
+ const dataRows = $table
+ .find("tr")
+ .toArray()
+ .filter((row): row is Element => row.type === "tag" && !headerRowSet.has(row));
+
+ if (dataRows.length <= gfmTableChunkRows) {
+ return null;
+ }
+
+ const chunks: string[] = [];
+ const headerHtml = headerRows.map((row) => $.html(row)).join("");
+ const preservedChildHtml = this.getPreservedTableChildHtml($, $table);
+
+ for (let index = 0; index < dataRows.length; index += gfmTableChunkRows) {
+ const rowsHtml = dataRows
+ .slice(index, index + gfmTableChunkRows)
+ .map((row) => $.html(row))
+ .join("");
+ const tableParts = ["", preservedChildHtml];
+ if (headerHtml) {
+ tableParts.push("", headerHtml, "");
+ }
+ tableParts.push("", rowsHtml, "
");
+ chunks.push(tableParts.join(""));
+ }
+
+ return chunks.join("\n");
+ }
+
+ private getPreservedTableChildHtml(
+ $: cheerio.CheerioAPI,
+ $table: cheerio.Cheerio,
+ ): string {
+ return $table
+ .children("caption, colgroup")
+ .toArray()
+ .filter((child): child is Element => child.type === "tag")
+ .map((child) => $.html(child))
+ .join("");
+ }
+
+ private getHeaderRows($table: cheerio.Cheerio): Element[] {
+ const theadRows = $table
+ .children("thead")
+ .find("tr")
+ .toArray()
+ .filter((row): row is Element => row.type === "tag");
+
+ if (theadRows.length > 0) {
+ return theadRows;
+ }
+
+ const firstRow = $table.find("tr").first();
+ if (firstRow.length === 0) {
+ return [];
+ }
+
+ const cells = firstRow.children("th, td").toArray();
+ const isHeaderRow =
+ cells.length > 0 &&
+ cells.every((cell) => cell.type === "tag" && cell.tagName === "th");
+
+ return isHeaderRow ? (firstRow.toArray() as Element[]) : [];
+ }
+
/**
* Processes the context to convert the sanitized HTML body node to Markdown.
* @param context The current processing context.
@@ -119,6 +250,8 @@ export class HtmlToMarkdownMiddleware implements ContentProcessorMiddleware {
// Only process if we have a Cheerio object (implicitly means it's HTML)
try {
logger.debug(`Converting HTML content to Markdown for ${context.source}`);
+ this.preservedTableHtml.clear();
+ this.splitOversizedTables($, context.source);
// Provide Turndown with the HTML string content from the Cheerio object's body,
// or the whole document if body is empty/unavailable.
const htmlToConvert = $("body").html() || $.html();
@@ -147,6 +280,8 @@ export class HtmlToMarkdownMiddleware implements ContentProcessorMiddleware {
),
);
// Decide if pipeline should stop? For now, continue.
+ } finally {
+ this.preservedTableHtml.clear();
}
// Call the next middleware in the chain regardless of whether conversion happened