From bab80d41484523aba4ca28c1cc7b40ce788e5b7f Mon Sep 17 00:00:00 2001 From: MQ Date: Tue, 11 Mar 2025 16:27:03 +0100 Subject: [PATCH 1/4] add block media feature for playwright - blocks images, videos, css --- .actor/input_schema.json | 6 ++++++ src/input.ts | 21 +++++++++++++++++++++ src/types.ts | 1 + 3 files changed, 28 insertions(+) diff --git a/.actor/input_schema.json b/.actor/input_schema.json index 67dba0f..9e7b2a3 100644 --- a/.actor/input_schema.json +++ b/.actor/input_schema.json @@ -145,6 +145,12 @@ "description": "If enabled, the Actor attempts to close or remove cookie consent dialogs to improve the quality of extracted text. Note that this setting increases the latency.", "default": true }, + "blockMedia": { + "title": "Block media resources", + "type": "boolean", + "description": "If enabled, the Actor will block loading of images, videos and CSS resources when using the Playwright browser. This can improve performance and reduce bandwidth usage.", + "default": false + }, "debugMode": { "title": "Enable debug mode", "type": "boolean", diff --git a/src/input.ts b/src/input.ts index d0f9822..a44a2ad 100644 --- a/src/input.ts +++ b/src/input.ts @@ -111,6 +111,27 @@ function createPlaywrightCrawlerOptions(input: Input, proxy: ProxyConfiguration maxConcurrency, minConcurrency, }, + preNavigationHooks: input.blockMedia ? 
[ + async ({ page }) => { + await page.route('**/*', async (route) => { + const resourceType = route.request().resourceType(); + const url = route.request().url(); + + // Block by resource type (image/video/media/stylesheet), or by file extension even when a query string or fragment follows it + if ( + resourceType === 'image' + || resourceType === 'video' + || resourceType === 'media' + || resourceType === 'stylesheet' + || /\.(jpg|jpeg|png|gif|bmp|webp|mp4|webm|ogg|mov|css)(?:[?#]|$)/i.test(url) + ) { + await route.abort(); + } else { + await route.continue(); + } + }); + }, + ] : [], }, }; } diff --git a/src/types.ts b/src/types.ts index 8cd14ab..21c5642 100644 --- a/src/types.ts +++ b/src/types.ts @@ -32,6 +32,7 @@ export type Input = { removeElementsCssSelector: string; removeCookieWarnings: boolean; scrapingTool: 'browser-playwright' | 'raw-http'; + blockMedia: boolean; }; export type StandbyInput = Input & { From e9a81764f4d302921ea2bc7444b20867b8df7ac1 Mon Sep 17 00:00:00 2001 From: MQ Date: Wed, 12 Mar 2025 22:13:16 +0100 Subject: [PATCH 2/4] block media by default --- .actor/input_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.actor/input_schema.json b/.actor/input_schema.json index 9e7b2a3..b529a26 100644 --- a/.actor/input_schema.json +++ b/.actor/input_schema.json @@ -149,7 +149,7 @@ "title": "Block media resources", "type": "boolean", "description": "If enabled, the Actor will block loading of images, videos and CSS resources when using the Playwright browser. 
This can improve performance and reduce bandwidth usage.", - "default": false + "default": true }, "debugMode": { "title": "Enable debug mode", From 5580150b3f9bbefdf746f84b357d563ad2c1c136 Mon Sep 17 00:00:00 2001 From: MQ Date: Wed, 12 Mar 2025 22:15:25 +0100 Subject: [PATCH 3/4] add to consts --- src/const.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/const.ts b/src/const.ts index 8df5514..64f1015 100644 --- a/src/const.ts +++ b/src/const.ts @@ -37,6 +37,7 @@ export const defaults = { query: undefined, // No default value in input_schema.json readableTextCharThreshold: 100, // Not in input_schema.json removeCookieWarnings: inputSchema.properties.removeCookieWarnings.default, + blockMedia: inputSchema.properties.blockMedia.default, removeElementsCssSelector: inputSchema.properties.removeElementsCssSelector.default, requestTimeoutSecs: inputSchema.properties.requestTimeoutSecs.default, requestTimeoutSecsMax: inputSchema.properties.requestTimeoutSecs.maximum, From c925b764bf1302992271640952942c7822173da3 Mon Sep 17 00:00:00 2001 From: MQ Date: Thu, 13 Mar 2025 12:40:34 +0100 Subject: [PATCH 4/4] Add media blocking option - README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 2c6fa1b..663aefd 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ The extracted text can then be injected into prompts and retrieval augmented gen - 📝 Output formats include **Markdown**, plain text, and HTML - 🔌 Supports **OpenAPI and MCP** for easy integration - 🪟 It's **open source**, so you can review and modify it +- 🖼️ **Media blocking** to skip images, videos, and CSS for faster scraping and lower bandwidth usage ## Example @@ -119,6 +120,7 @@ The `/search` GET HTTP endpoint accepts the following query parameters: | `maxRequestRetries` | number | `1` | The maximum number of times the Actor will retry loading the target web page on error. If the last attempt fails, the page will be skipped in the results. 
| | `dynamicContentWaitSecs` | number | `10` | The maximum time in seconds to wait for dynamic page content to load. The Actor considers the web page as fully loaded once this time elapses or when the network becomes idle. | | `removeCookieWarnings` | boolean | `true` | If enabled, removes cookie consent dialogs to improve text extraction accuracy. This might increase latency. | +| `blockMedia` | boolean | `true` | If enabled, blocks loading of images, videos, and CSS when using `browser-playwright`, improving speed and reducing bandwidth usage. | | `removeElementsCssSelector` | string | `see input` | A CSS selector matching HTML elements that will be removed from the DOM, before converting it to text, Markdown, or saving as HTML. This is useful to skip irrelevant page content. The value must be a valid CSS selector as accepted by the `document.querySelectorAll()` function. \n\nBy default, the Actor removes common navigation elements, headers, footers, modals, scripts, and inline image. You can disable the removal by setting this value to some non-existent CSS selector like `dummy_keep_everything`. | | `debugMode` | boolean | `false` | If enabled, the Actor will store debugging information in the dataset's debug field. |