Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,89 @@ describe("HtmlPlaywrightMiddleware", () => {
launchSpy.mockRestore();
});

it("should reset the browser after a Playwright render timeout", async () => {
const initialHtml = "<html><body><p>Test</p></body></html>";
const context = createPipelineTestContext(initialHtml, "https://example.com/test");
const next = vi.fn();

const pageSpy = createMockPlaywrightPage(initialHtml);
pageSpy.waitForSelector = vi
.fn()
.mockRejectedValue(new Error("Timeout 5000ms exceeded"));
const contextSpy = {
newPage: vi.fn().mockResolvedValue(pageSpy),
close: vi.fn().mockResolvedValue(undefined),
};
const browserSpy = {
newContext: vi.fn().mockResolvedValue(contextSpy),
isConnected: vi.fn().mockReturnValue(true),
on: vi.fn(),
close: vi.fn().mockResolvedValue(undefined),
} as unknown as MockedObject<Browser>;
const launchSpy = vi.spyOn(chromium, "launch").mockResolvedValue(browserSpy);
const closeBrowserSpy = vi.spyOn(playwrightMiddleware, "closeBrowser");

await playwrightMiddleware.process(context, next);

expect(context.errors).toHaveLength(1);
expect(context.errors[0].message).toContain("Timeout 5000ms exceeded");
expect(pageSpy.close).toHaveBeenCalled();
expect(contextSpy.close).toHaveBeenCalled();
expect(browserSpy.close).toHaveBeenCalled();
expect(closeBrowserSpy).toHaveBeenCalledWith(
"reset after Playwright render failure for https://example.com/test",
);
expect(next).toHaveBeenCalled();

closeBrowserSpy.mockRestore();
launchSpy.mockRestore();
});

it("should bound hung page cleanup and reset the browser", async () => {
vi.useFakeTimers();
try {
const initialHtml = "<html><body><p>Test</p></body></html>";
const context = createPipelineTestContext(
initialHtml,
"https://example.com/test",
);
const next = vi.fn();

const pageSpy = createMockPlaywrightPage(initialHtml);
pageSpy.close = vi.fn().mockImplementation(() => new Promise(() => {}));
const contextSpy = {
newPage: vi.fn().mockResolvedValue(pageSpy),
close: vi.fn().mockResolvedValue(undefined),
};
const browserSpy = {
newContext: vi.fn().mockResolvedValue(contextSpy),
isConnected: vi.fn().mockReturnValue(true),
on: vi.fn(),
close: vi.fn().mockResolvedValue(undefined),
} as unknown as MockedObject<Browser>;
const launchSpy = vi.spyOn(chromium, "launch").mockResolvedValue(browserSpy);
const closeBrowserSpy = vi.spyOn(playwrightMiddleware, "closeBrowser");

const processPromise = playwrightMiddleware.process(context, next);
await vi.advanceTimersByTimeAsync(5000);
await expect(processPromise).resolves.toBeUndefined();

expect(context.errors).toHaveLength(0);
expect(pageSpy.close).toHaveBeenCalled();
expect(contextSpy.close).toHaveBeenCalled();
expect(browserSpy.close).toHaveBeenCalled();
expect(closeBrowserSpy).toHaveBeenCalledWith(
"reset after Playwright cleanup failure for https://example.com/test",
);
expect(next).toHaveBeenCalled();

closeBrowserSpy.mockRestore();
launchSpy.mockRestore();
} finally {
vi.useRealTimers();
}
});

it("should skip processing when scrapeMode is not playwright/auto", async () => {
const initialHtml = "<html><body><p>Test</p></body></html>";
const context = createPipelineTestContext(initialHtml, "https://example.com/test", {
Expand Down
163 changes: 149 additions & 14 deletions src/scraper/middleware/HtmlPlaywrightMiddleware.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,16 @@ interface CachedResource {
contentType: string;
}

interface BrowserProcessHandle {
killed: boolean;
pid?: number;
kill(signal?: NodeJS.Signals | number): boolean;
}

interface BrowserWithProcessHandle extends Browser {
process(): BrowserProcessHandle | null;
}

/**
* Middleware to process HTML content using Playwright for rendering dynamic content,
* *if* the scrapeMode option requires it ('playwright' or 'auto').
Expand Down Expand Up @@ -116,27 +126,135 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware {
return this.browser;
}

private getPageTimeoutMs(): number {
return this.config.pageTimeoutMs ?? defaults.scraper.pageTimeoutMs;
}

private getBrowserTimeoutMs(): number {
return this.config.browserTimeoutMs ?? defaults.scraper.browserTimeoutMs;
}

private async withTimeout<T>(
operation: Promise<T>,
timeoutMs: number,
description: string,
): Promise<T> {
let timeout: NodeJS.Timeout | undefined;
try {
return await Promise.race([
operation,
new Promise<T>((_resolve, reject) => {
timeout = setTimeout(() => {
reject(new Error(`${description} timed out after ${timeoutMs}ms`));
}, timeoutMs);
}),
]);
} finally {
if (timeout) {
clearTimeout(timeout);
}
}
}

private getBrowserProcess(browser: Browser): BrowserProcessHandle | null {
if (!("process" in browser) || typeof browser.process !== "function") {
return null;
}

return (browser as BrowserWithProcessHandle).process();
}

private killBrowserProcess(browser: Browser, reason: string): void {
const browserProcess = this.getBrowserProcess(browser);
if (!browserProcess || browserProcess.killed) {
return;
}

const pid = browserProcess.pid ? ` pid=${browserProcess.pid}` : "";
logger.warn(`⚠️ Force killing Playwright browser process${pid}: ${reason}`);
try {
if (!browserProcess.kill("SIGKILL")) {
logger.warn(`⚠️ Failed to force kill Playwright browser process${pid}`);
}
} catch (error) {
logger.warn(`⚠️ Failed to force kill Playwright browser process${pid}: ${error}`);
}
}
Comment thread
arabold marked this conversation as resolved.

/**
* Closes the Playwright browser instance if it exists.
* Should be called during application shutdown.
* Attempts to close even if the browser is disconnected to ensure proper cleanup of zombie processes.
*/
async closeBrowser(): Promise<void> {
async closeBrowser(reason = "shutdown"): Promise<void> {
if (this.browser) {
const browser = this.browser;
try {
logger.debug("Closing Playwright browser instance...");
logger.debug(`Closing Playwright browser instance (${reason})...`);
// Always attempt to close, even if disconnected, to reap zombie processes
await this.browser.close();
await this.withTimeout(
browser.close(),
this.getPageTimeoutMs(),
"Playwright browser close",
);
} catch (error) {
// Log error but don't throw - cleanup should be non-fatal
logger.warn(`⚠️ Error closing Playwright browser: ${error}`);
logger.warn(`⚠️ Error closing Playwright browser (${reason}): ${error}`);
this.killBrowserProcess(browser, reason);
} finally {
// Always set to null to allow fresh browser on next request
this.browser = null;
}
}
}

private async cleanupPageAndContext(
page: Page | null,
browserContext: BrowserContext | null,
source: string,
): Promise<boolean> {
let cleanupSucceeded = true;
const timeoutMs = this.getPageTimeoutMs();

if (page) {
try {
await this.withTimeout(
page.unroute("**/*"),
timeoutMs,
`Playwright route cleanup for ${source}`,
);
} catch (error) {
cleanupSucceeded = false;
logger.warn(`⚠️ Playwright route cleanup failed for ${source}: ${error}`);
}

try {
await this.withTimeout(
page.close(),
timeoutMs,
`Playwright page close for ${source}`,
);
} catch (error) {
cleanupSucceeded = false;
logger.warn(`⚠️ Playwright page cleanup failed for ${source}: ${error}`);
}
}

if (browserContext) {
try {
await this.withTimeout(
browserContext.close(),
timeoutMs,
`Playwright browser context close for ${source}`,
);
} catch (error) {
cleanupSucceeded = false;
logger.warn(`⚠️ Playwright context cleanup failed for ${source}: ${error}`);
}
}

return cleanupSucceeded;
}

/**
* Injects the shadow DOM extractor script into the page.
* This script performs non-invasive extraction that preserves document structure.
Expand Down Expand Up @@ -979,6 +1097,8 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware {
let page: Page | null = null;
let browserContext: BrowserContext | null = null;
let renderedHtml: string | null = null;
let shouldResetBrowser = false;
let renderFailed = false;

// Extract credentials and origin using helper
const { credentials, origin } = extractCredentialsAndOrigin(context.source);
Expand Down Expand Up @@ -1052,11 +1172,15 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware {
);
});

// Wait for either body (normal HTML) or frameset (frameset documents) to appear
const pageTimeoutMs = this.getPageTimeoutMs();

// Load initial HTML content
await page.goto(context.source, { waitUntil: "load" });
await page.goto(context.source, {
waitUntil: "load",
timeout: this.getBrowserTimeoutMs(),
});

// Wait for either body (normal HTML) or frameset (frameset documents) to appear
const pageTimeoutMs = this.config.pageTimeoutMs ?? defaults.scraper.pageTimeoutMs;
await page.waitForSelector("body, frameset", {
timeout: pageTimeoutMs,
});
Expand All @@ -1081,20 +1205,31 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware {
`Playwright: Successfully rendered content for ${context.source} using ${method}`,
);
} catch (error) {
renderFailed = true;
shouldResetBrowser = true;
logger.error(`❌ Playwright failed to render ${context.source}: ${error}`);
context.errors.push(
error instanceof Error
? error
: new Error(`Playwright rendering failed: ${String(error)}`),
);
} finally {
// Ensure page/context are closed even if subsequent steps fail
if (page) {
await page.unroute("**/*");
await page.close();
const cleanupSucceeded = await this.cleanupPageAndContext(
page,
browserContext,
context.source,
);
const cleanupFailed = !cleanupSucceeded;
if (cleanupFailed) {
shouldResetBrowser = true;
}
if (browserContext) {
await browserContext.close();
if (shouldResetBrowser) {
const resetReason = renderFailed
? cleanupFailed
? `reset after Playwright render and cleanup failure for ${context.source}`
: `reset after Playwright render failure for ${context.source}`
: `reset after Playwright cleanup failure for ${context.source}`;
await this.closeBrowser(resetReason);
}
}

Expand Down
Loading
Loading