From 13e15015db44022336b21f2ea729d521db6be7e1 Mon Sep 17 00:00:00 2001 From: Jiri Spilka Date: Tue, 18 Mar 2025 11:25:50 +0100 Subject: [PATCH 1/8] fix: sort defaults --- src/const.ts | 4 ++-- src/types.ts | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/const.ts b/src/const.ts index 64f1015..e27b8df 100644 --- a/src/const.ts +++ b/src/const.ts @@ -21,6 +21,7 @@ export const PLAYWRIGHT_REQUEST_TIMEOUT_NORMAL_MODE_SECS = 60; // Default values parsed from input_schema.json export const defaults = { + blockMedia: inputSchema.properties.blockMedia.default, debugMode: inputSchema.properties.debugMode.default, dynamicContentWaitSecs: inputSchema.properties.dynamicContentWaitSecs.default, htmlTransformer: inputSchema.properties.htmlTransformer.default, @@ -37,12 +38,11 @@ export const defaults = { query: undefined, // No default value in input_schema.json readableTextCharThreshold: 100, // Not in input_schema.json removeCookieWarnings: inputSchema.properties.removeCookieWarnings.default, - blockMedia: inputSchema.properties.blockMedia.default, removeElementsCssSelector: inputSchema.properties.removeElementsCssSelector.default, requestTimeoutSecs: inputSchema.properties.requestTimeoutSecs.default, requestTimeoutSecsMax: inputSchema.properties.requestTimeoutSecs.maximum, + scrapingTool: inputSchema.properties.scrapingTool.default, serpMaxRetries: inputSchema.properties.serpMaxRetries.default, serpMaxRetriesMax: inputSchema.properties.serpMaxRetries.maximum, serpProxyGroup: inputSchema.properties.serpProxyGroup.default, - scrapingTool: inputSchema.properties.scrapingTool.default, }; diff --git a/src/types.ts b/src/types.ts index 21c5642..026ff6a 100644 --- a/src/types.ts +++ b/src/types.ts @@ -113,7 +113,9 @@ export type Output = { loadedAt?: Date; requestStatus: string; uniqueKey: string; - debug?: unknown; + debug?: { + timeMeasures?: TimeMeasure[]; + }; }; searchResult: OrganicResult; metadata: { From e70822777c9ceaaa6b2ccb56f9c42f99d61fa6b4 Mon Sep 17 00:00:00 2001 From: Jiri Spilka Date: Tue, 18 Mar 2025 16:18:07 +0100 Subject: [PATCH 2/8] fix preNavigationHooks --- src/crawlers.ts | 5 +- src/input.ts | 20 ++++-- src/performance-evaluation.ts | 132 ++++++++++++++++++++++++++++++++++ src/types.ts | 1 + 4 files changed, 151 insertions(+), 7 deletions(-) create mode 100644 src/performance-evaluation.ts diff --git a/src/crawlers.ts b/src/crawlers.ts index eb28d20..683bad5 100644 --- a/src/crawlers.ts +++ b/src/crawlers.ts @@ -23,7 +23,10 @@ const crawlers = new Map(); const client = new MemoryStorage({ persistStorage: false }); export function getCrawlerKey(crawlerOptions: CheerioCrawlerOptions | PlaywrightCrawlerOptions) { - return JSON.stringify(crawlerOptions); + // remove 'log' field from crawler key + // eslint-disable-next-line @typescript-eslint/no-unused-vars + const { log: _log, ...rest } = crawlerOptions; + return JSON.stringify(rest); } /** diff --git a/src/input.ts b/src/input.ts index a44a2ad..f26bfbc 100644 --- a/src/input.ts +++ b/src/input.ts @@ -46,8 +46,10 @@ async function processInputInternal( if (originalInput.outputFormats && typeof originalInput.outputFormats === 'string') { originalInput.outputFormats = originalInput.outputFormats.split(',').map((format) => format.trim()) as OutputFormats[]; } + if (originalInput.blockMedia && typeof originalInput.blockMedia === 'string') { + originalInput.blockMedia = originalInput.blockMedia === 'true' || originalInput.blockMedia === '1'; + } const input = { ...defaults, ...originalInput } as Input; - validateAndFillInput(input, standbyInit); const { @@ -90,7 +92,7 @@ function createPlaywrightCrawlerOptions(input: Input, proxy: ProxyConfiguration return { type: ContentCrawlerTypes.PLAYWRIGHT, crawlerOptions: { - headless: true, + headless: false, keepAlive, maxRequestRetries, proxyConfiguration: proxy, @@ -111,27 +113,27 @@ function createPlaywrightCrawlerOptions(input: Input, proxy: ProxyConfiguration maxConcurrency, minConcurrency, }, - preNavigationHooks: input.blockMedia ? [ + preNavigationHooks: [ async ({ page }) => { await page.route('**/*', async (route) => { const resourceType = route.request().resourceType(); const url = route.request().url(); // Block if it's an image/video/css resource type or has an image/video extension - if ( + if (input.blockMedia && ( resourceType === 'image' || resourceType === 'video' || resourceType === 'media' || resourceType === 'stylesheet' || /\.(jpg|jpeg|png|gif|bmp|webp|mp4|webm|ogg|mov|css)$/i.test(url) - ) { + )) { await route.abort(); } else { await route.continue(); } }); }, - ] : [], + ], }, }; } @@ -207,4 +209,10 @@ export function validateAndFillInput(input: Input, standbyInit: boolean) { if (input.scrapingTool !== 'browser-playwright' && input.scrapingTool !== 'raw-http') { throw new UserInputError('The `scrapingTool` parameter must be either `browser-playwright` or `raw-http`.'); } + // handle case when blockMedia is not defined, coerce blockMedia to boolean + if (input.blockMedia === undefined || input.blockMedia === null) { + input.blockMedia = defaults.blockMedia; + } else { + throw new UserInputError('The `blockMedia` parameter must be a boolean or a string.'); + } } diff --git a/src/performance-evaluation.ts b/src/performance-evaluation.ts new file mode 100644 index 0000000..e2f8a1c --- /dev/null +++ b/src/performance-evaluation.ts @@ -0,0 +1,132 @@ +/** + * Performance evaluation of the RAG Web Browser with respect to different settings. + * This script runs a series of queries and saves performance results into a dataset. + * The results include average time for each time measure event. + * + * The evaluation is performed with different combinations of the following parameters: + * - `scrapingTool`: The tool used for scraping (e.g., `raw-http`, `browser-playwright`). + * - `mediaBlocked`: Whether media content is blocked during scraping (true/false). + * - `maxResults`: The maximum number of results to scrape (e.g., 1, 3). + * + * The script performs the following steps: + * 1. Runs a set of predefined queries using different combinations of parameters. + * 2. Fetches the results and computes the average time for each time measure event. + * 3. Logs the performance results, including average latency for each combination of parameters. + * 4. Aborts the last run of the actor to ensure no resources are wasted. + * + * The results are stored in a table format, showing the average latency for each combination of parameters. + * + * Usage: + * - Ensure the `APIFY_TOKEN` environment variable is set with your Apify API token. + * - Run the script to perform the performance evaluation. + * - The results will be logged to the console. + */ + +import { log } from 'apify'; + +import { Output } from './types'; + +const EVALUATION_QUERIES = [ + 'apify', + 'donald trump', + 'ai agents', +]; + +const apifyToken = process.env.APIFY_TOKEN; + +const user = 'jiri-spilka'; +const actorId = 'apify~rag-web-browser'; +const urlUserActor = `${user}--rag-web-browser-task`; + +const memory = 8; // memory can't be changed in the standby mode +const scrapingToolSet = ['raw-http', 'browser-playwright']; +const mediaBlockedSet = [true, false]; +const maxResultsSet = [1, 3]; + +const url = `https://${urlUserActor}.apify.actor/search`; + +const headers = { + Accept: 'application/json', + Authorization: `Bearer ${apifyToken}`, +}; + +const results = new Map(); +const resultsTable = []; + +for (const scrapingTool of scrapingToolSet) { + for (const blockMedia of mediaBlockedSet) { + for (const maxResults of maxResultsSet) { + log.info(`Running ${EVALUATION_QUERIES.length} query/queries with ${scrapingTool}, ${blockMedia ? 'blocked media' : 'unblocked media'}, maxResults=${maxResults}`); + for (const q of EVALUATION_QUERIES) { + const queryParams = new URLSearchParams({ query: q, scrapingTool, blockMedia: blockMedia.toString(), debugMode: 'true', maxResults: maxResults.toString() }); + const urlWithParams = `${url}?${queryParams.toString()}`; + log.info(`Running ${urlWithParams}`); + const res = await fetch(urlWithParams, { method: 'GET', headers }); + if (!res.ok) { + throw new Error(`Failed to run the actor: ${JSON.stringify(await res.json())}`); + } + const data: Output[] = await res.json(); + log.info(`Received number of results: ${data.length}`); + const k = `${scrapingTool}__${blockMedia ? 'blocked' : 'unblocked'}__${maxResults}`; + if (results.has(k)) { + results.set(k, [...results.get(k)!, ...data]); + } else { + results.set(k, data); + } + } + } + } +} + +log.info(`Get the last run: ${actorId}`); +const response = await fetch(`https://api.apify.com/v2/acts/${actorId}/runs/last`, { headers }); +const resp = await response.json(); +const { id: runId } = resp.data; + +log.info(`Abort run ${runId}`); +const r = await fetch(`https://api.apify.com/v2/actor-runs/${runId}/abort`, { headers }); +log.info(`The last run has been aborted: ${await r.json()}`); + +for (const [key, data] of results) { + const remoteDataset = data; + log.info('Compute average time for each time measure event'); + const timeMeasuresMap = new Map(); + const timeMeasuresTimeTaken = []; + + // compute average time for the timeMeasures + for (const item of remoteDataset) { + const { timeMeasures } = item.crawl.debug ?? {}; + if (!timeMeasures) { + continue; + } + for (const measure of timeMeasures) { + if (!timeMeasuresMap.has(measure.event)) { + timeMeasuresMap.set(measure.event, []); + } + timeMeasuresMap.set(measure.event, [...timeMeasuresMap.get(measure.event)!, measure.timeDeltaPrevMs]); + if (measure.event === 'playwright-before-response-send' || measure.event === 'cheerio-before-response-send') { + timeMeasuresTimeTaken.push(measure.timeMs); + } + } + } + log.info('Average time for each time measure event:', timeMeasuresMap); + + for (const [k, value] of timeMeasuresMap) { + const sum = value.reduce((a, b) => a + b, 0); + const avg = sum / value.length; + log.info(`${k}: ${avg.toFixed(0)} ms`); + } + + log.info('Time taken for each request:', timeMeasuresTimeTaken); + log.info('Time taken on average', { average: timeMeasuresTimeTaken.reduce((a, b) => a + b, 0) / timeMeasuresTimeTaken.length }); + + // Store results for the table + const avgLatency = timeMeasuresTimeTaken.reduce((a, b) => a + b, 0) / timeMeasuresTimeTaken.length / 1000; + resultsTable.push(`| ${memory} | ${key.split('__')[0]} | ${key.split('__')[1]} | ${key.split('__')[2]} | ${avgLatency.toFixed(1)} |`); +} + +// Print the results table +log.info('\nPerformance Results:'); +log.info('| Memory (GB) | Scraping Tool | Media Blocked | Max Results | Latency (sec) |'); +log.info('|-------------|---------------|---------------|-------------|---------------|'); +resultsTable.forEach((row) => log.info(row)); diff --git a/src/types.ts b/src/types.ts index 026ff6a..f7b1ca6 100644 --- a/src/types.ts +++ b/src/types.ts @@ -37,6 +37,7 @@ export type Input = { export type StandbyInput = Input & { outputFormats: OutputFormats[] | string + blockMedia: boolean | string; } export type OrganicResult = { From 8b38952e0c8bbbc5f325cffb8d32c3b72e3299fa Mon Sep 17 00:00:00 2001 From: Jiri Spilka Date: Tue, 18 Mar 2025 16:29:16 +0100 Subject: [PATCH 3/8] fix: input --- src/input.ts | 5 ----- src/performance-evaluation.ts | 1 + 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/input.ts b/src/input.ts index f26bfbc..c659b1d 100644 --- a/src/input.ts +++ b/src/input.ts @@ -46,9 +46,6 @@ async function processInputInternal( if (originalInput.outputFormats && typeof originalInput.outputFormats === 'string') { originalInput.outputFormats = originalInput.outputFormats.split(',').map((format) => format.trim()) as OutputFormats[]; } - if (originalInput.blockMedia && typeof originalInput.blockMedia === 'string') { - originalInput.blockMedia = originalInput.blockMedia === 'true' || originalInput.blockMedia === '1'; - } const input = { ...defaults, ...originalInput } as Input; validateAndFillInput(input, standbyInit); @@ -212,7 +209,5 @@ export function validateAndFillInput(input: Input, standbyInit: boolean) { // handle case when blockMedia is not defined, coerce blockMedia to boolean if (input.blockMedia === undefined || input.blockMedia === null) { input.blockMedia = defaults.blockMedia; - } else { - throw new UserInputError('The `blockMedia` parameter must be a boolean or a string.'); } } diff --git a/src/performance-evaluation.ts b/src/performance-evaluation.ts index e2f8a1c..8abbece 100644 --- a/src/performance-evaluation.ts +++ b/src/performance-evaluation.ts @@ -109,6 +109,7 @@ for (const [key, data] of results) { } } } + log.info(`Performance for key: ${key}`); log.info('Average time for each time measure event:', timeMeasuresMap); for (const [k, value] of timeMeasuresMap) { From 9c2ab32d0610ed4786f3a199049742fd36af4707 Mon Sep 17 00:00:00 2001 From: Jiri Spilka Date: Tue, 18 Mar 2025 16:45:12 +0100 Subject: [PATCH 4/8] fix: input and headless --- src/crawlers.ts | 5 +---- src/input.ts | 7 ++++++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/crawlers.ts b/src/crawlers.ts index 683bad5..eb28d20 100644 --- a/src/crawlers.ts +++ b/src/crawlers.ts @@ -23,10 +23,7 @@ const crawlers = new Map(); const client = new MemoryStorage({ persistStorage: false }); export function getCrawlerKey(crawlerOptions: CheerioCrawlerOptions | PlaywrightCrawlerOptions) { - // remove 'log' field from crawler key - // eslint-disable-next-line @typescript-eslint/no-unused-vars - const { log: _log, ...rest } = crawlerOptions; - return JSON.stringify(rest); + return JSON.stringify(crawlerOptions); } /** diff --git a/src/input.ts b/src/input.ts index c659b1d..ad2e8ae 100644 --- a/src/input.ts +++ b/src/input.ts @@ -46,6 +46,11 @@ async function processInputInternal( if (originalInput.outputFormats && typeof originalInput.outputFormats === 'string') { originalInput.outputFormats = originalInput.outputFormats.split(',').map((format) => format.trim()) as OutputFormats[]; } + + if (typeof originalInput.blockMedia === 'string') { + originalInput.blockMedia = originalInput.blockMedia === 'true' || originalInput.blockMedia === '1'; + } + const input = { ...defaults, ...originalInput } as Input; validateAndFillInput(input, standbyInit); @@ -89,7 +94,7 @@ function createPlaywrightCrawlerOptions(input: Input, proxy: ProxyConfiguration return { type: ContentCrawlerTypes.PLAYWRIGHT, crawlerOptions: { - headless: false, + headless: true, keepAlive, maxRequestRetries, proxyConfiguration: proxy, From 52687508eb45c7a3590e9834e94335b5bb46a9b2 Mon Sep 17 00:00:00 2001 From: Jiri Spilka Date: Tue, 18 Mar 2025 16:47:38 +0100 Subject: [PATCH 5/8] fix: false positive issue --- src/input.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/input.ts b/src/input.ts index ad2e8ae..fd00965 100644 --- a/src/input.ts +++ b/src/input.ts @@ -47,6 +47,7 @@ async function processInputInternal( originalInput.outputFormats = originalInput.outputFormats.split(',').map((format) => format.trim()) as OutputFormats[]; } + // noinspection SuspiciousTypeOfGuard if (typeof originalInput.blockMedia === 'string') { originalInput.blockMedia = originalInput.blockMedia === 'true' || originalInput.blockMedia === '1'; } From 888a714e624999316667a3953efc31fc9812e470 Mon Sep 17 00:00:00 2001 From: Jiri Spilka Date: Tue, 18 Mar 2025 21:19:21 +0100 Subject: [PATCH 6/8] fix: add blocking into a function, pass blockMedia in userData --- src/crawlers.ts | 30 ++++++++++++++++++++++++++++++ src/input.ts | 21 --------------------- src/search.ts | 1 + src/types.ts | 1 + src/utils.ts | 2 ++ 5 files changed, 34 insertions(+), 21 deletions(-) diff --git a/src/crawlers.ts b/src/crawlers.ts index eb28d20..12572d1 100644 --- a/src/crawlers.ts +++ b/src/crawlers.ts @@ -130,6 +130,31 @@ export async function createAndStartContentCrawler( return { key, crawler }; } +/** + * PreNavigation hook that blocks resources based on the blockMedia setting + * from the request's userData. + * Only blocks resources if blockMedia is true. + */ +async function blockMediaResourcesHook({ page, request }: PlaywrightCrawlingContext) { + await page.route('**/*', async (route) => { + const resourceType = route.request().resourceType(); + const url = route.request().url(); + + // Block if it's an image/video/css resource type or has an image/video extension + if (request.userData.blockMedia && ( + resourceType === 'image' + || resourceType === 'video' + || resourceType === 'media' + || resourceType === 'stylesheet' + || /\.(jpg|jpeg|png|gif|bmp|webp|mp4|webm|ogg|mov|css)$/i.test(url) + )) { + await route.abort(); + } else { + await route.continue(); + } + }); +} + async function createPlaywrightContentCrawler( crawlerOptions: PlaywrightCrawlerOptions, key: string, @@ -143,6 +168,11 @@ async function createPlaywrightContentCrawler( await requestHandlerPlaywright(context as unknown as PlaywrightCrawlingContext); }, failedRequestHandler: ({ request }, err) => failedRequestHandler(request, err, ContentCrawlerTypes.PLAYWRIGHT), + preNavigationHooks: [ + async (context) => { + await blockMediaResourcesHook(context as unknown as PlaywrightCrawlingContext); + }, + ], }); } diff --git a/src/input.ts b/src/input.ts index fd00965..15e94e3 100644 --- a/src/input.ts +++ b/src/input.ts @@ -116,27 +116,6 @@ function createPlaywrightCrawlerOptions(input: Input, proxy: ProxyConfiguration maxConcurrency, minConcurrency, }, - preNavigationHooks: [ - async ({ page }) => { - await page.route('**/*', async (route) => { - const resourceType = route.request().resourceType(); - const url = route.request().url(); - - // Block if it's an image/video/css resource type or has an image/video extension - if (input.blockMedia && ( - resourceType === 'image' - || resourceType === 'video' - || resourceType === 'media' - || resourceType === 'stylesheet' - || /\.(jpg|jpeg|png|gif|bmp|webp|mp4|webm|ogg|mov|css)$/i.test(url) - )) { - await route.abort(); - } else { - await route.continue(); - } - }); - }, - ], }, }; } diff --git a/src/search.ts b/src/search.ts index 306f7f4..25b4373 100644 --- a/src/search.ts +++ b/src/search.ts @@ -39,6 +39,7 @@ function prepareRequest( responseId, contentScraperSettings, null, + input.blockMedia, ) : createSearchRequest( query, diff --git a/src/types.ts b/src/types.ts index f7b1ca6..9133594 100644 --- a/src/types.ts +++ b/src/types.ts @@ -89,6 +89,7 @@ export type ContentCrawlerUserData = { searchResult?: OrganicResult; contentCrawlerKey?: string; contentScraperSettings: ContentScraperSettings; + blockMedia?: boolean; }; export interface ContentScraperSettings { diff --git a/src/utils.ts b/src/utils.ts index 4028e28..f777cd6 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -80,6 +80,7 @@ export function createRequest( responseId: string, contentScraperSettings: ContentScraperSettings, timeMeasures: TimeMeasure[] | null = null, + blockMedia: boolean = false, ): RequestOptions { return { url: result.url!, @@ -90,6 +91,7 @@ export function createRequest( searchResult: result.url && result.title ? result : undefined, timeMeasures: timeMeasures ? [...timeMeasures] : [], contentScraperSettings, + blockMedia, }, }; } From 78934b99506fcbac5ba88f8cbddd6c41c6515cf4 Mon Sep 17 00:00:00 2001 From: Jiri Spilka Date: Tue, 18 Mar 2025 21:56:51 +0100 Subject: [PATCH 7/8] fix: add blocking into a function, pass blockMedia in userData --- src/crawlers.ts | 1 + src/performance-evaluation.ts | 41 +++++++++++++++++++++-------------- src/search.ts | 1 + src/types.ts | 1 + src/utils.ts | 2 ++ 5 files changed, 30 insertions(+), 16 deletions(-) diff --git a/src/crawlers.ts b/src/crawlers.ts index 12572d1..5689c16 100644 --- a/src/crawlers.ts +++ b/src/crawlers.ts @@ -75,6 +75,7 @@ export async function createAndStartSearchCrawler( responseId, request.userData.contentScraperSettings!, request.userData.timeMeasures!, + request.userData.blockMedia, ); await addContentCrawlRequest(r, responseId, request.userData.contentCrawlerKey!); } diff --git a/src/performance-evaluation.ts b/src/performance-evaluation.ts index 8abbece..850ca02 100644 --- a/src/performance-evaluation.ts +++ b/src/performance-evaluation.ts @@ -56,10 +56,18 @@ const resultsTable = []; for (const scrapingTool of scrapingToolSet) { for (const blockMedia of mediaBlockedSet) { for (const maxResults of maxResultsSet) { - log.info(`Running ${EVALUATION_QUERIES.length} query/queries with ${scrapingTool}, ${blockMedia ? 'blocked media' : 'unblocked media'}, maxResults=${maxResults}`); + log.info(`Running ${EVALUATION_QUERIES.length} query/queries with ${scrapingTool}, mediaBlocked=${blockMedia}, maxResults=${maxResults}`); + log.info('Start in standby mode'); + const r1 = await fetch(url, { method: 'GET', headers }); + if (!r1.ok) { + throw new Error(`Failed to run the actor: ${JSON.stringify(await r1.json())}`); + } else { + // sleep for 10 seconds to let the actor start + await new Promise((resolve) => setTimeout(resolve, 10000)); + } for (const q of EVALUATION_QUERIES) { const queryParams = new URLSearchParams({ query: q, scrapingTool, blockMedia: blockMedia.toString(), debugMode: 'true', maxResults: maxResults.toString() }); - const urlWithParams = `${url}?${queryParams.toString()}`; + const urlWithParams = `${url}/search?${queryParams.toString()}`; log.info(`Running ${urlWithParams}`); const res = await fetch(urlWithParams, { method: 'GET', headers }); if (!res.ok) { @@ -67,26 +75,26 @@ for (const scrapingTool of scrapingToolSet) { } const data: Output[] = await res.json(); log.info(`Received number of results: ${data.length}`); - const k = `${scrapingTool}__${blockMedia ? 'blocked' : 'unblocked'}__${maxResults}`; + const k = `${scrapingTool}__${blockMedia ? 'blocked' : 'allowed'}__${maxResults}`; if (results.has(k)) { results.set(k, [...results.get(k)!, ...data]); } else { results.set(k, data); } } + log.info(`Get the last run: ${actorId}`); + const response = await fetch(`https://api.apify.com/v2/acts/${actorId}/runs/last`, { headers }); + const resp = await response.json(); + const { id: runId } = resp.data; + + // it is better to abort run not to mix results and involve autoscaling into the mix + log.info(`Abort run ${runId}`); + const r = await fetch(`https://api.apify.com/v2/actor-runs/${runId}/abort`, { method: 'POST', headers }); + log.info(`The last run has been aborted status=${r.status}`); } } } -log.info(`Get the last run: ${actorId}`); -const response = await fetch(`https://api.apify.com/v2/acts/${actorId}/runs/last`, { headers }); -const resp = await response.json(); -const { id: runId } = resp.data; - -log.info(`Abort run ${runId}`); -const r = await fetch(`https://api.apify.com/v2/actor-runs/${runId}/abort`, { headers }); -log.info(`The last run has been aborted: ${await r.json()}`); - for (const [key, data] of results) { const remoteDataset = data; log.info('Compute average time for each time measure event'); @@ -118,16 +126,17 @@ for (const [key, data] of results) { log.info(`${k}: ${avg.toFixed(0)} ms`); } + const avgLatency = timeMeasuresTimeTaken.reduce((a, b) => a + b, 0) / timeMeasuresTimeTaken.length / 1000; log.info('Time taken for each request:', timeMeasuresTimeTaken); - log.info('Time taken on average', { average: timeMeasuresTimeTaken.reduce((a, b) => a + b, 0) / timeMeasuresTimeTaken.length }); + log.info('Time taken on average', { average: avgLatency.toFixed(1) }); // Store results for the table - const avgLatency = timeMeasuresTimeTaken.reduce((a, b) => a + b, 0) / timeMeasuresTimeTaken.length / 1000; - resultsTable.push(`| ${memory} | ${key.split('__')[0]} | ${key.split('__')[1]} | ${key.split('__')[2]} | ${avgLatency.toFixed(1)} |`); + const [scrapingTool, mediaBlocked, maxResults] = key.split('__'); + resultsTable.push(`| ${memory} | ${scrapingTool} | ${mediaBlocked} | ${maxResults} | ${avgLatency.toFixed(1)} |`); } // Print the results table log.info('\nPerformance Results:'); -log.info('| Memory (GB) | Scraping Tool | Media Blocked | Max Results | Latency (sec) |'); +log.info('| Memory (GB) | Scraping Tool | Media | Max Results | Latency (sec) |'); log.info('|-------------|---------------|---------------|-------------|---------------|'); resultsTable.forEach((row) => log.info(row)); diff --git a/src/search.ts b/src/search.ts index 25b4373..435765e 100644 --- a/src/search.ts +++ b/src/search.ts @@ -48,6 +48,7 @@ function prepareRequest( contentCrawlerKey, searchCrawlerOptions.proxyConfiguration, contentScraperSettings, + input.blockMedia, ); addTimeMeasureEvent(req.userData!, 'request-received', Date.now()); diff --git a/src/types.ts b/src/types.ts index 9133594..5c4d8ab 100644 --- a/src/types.ts +++ b/src/types.ts @@ -80,6 +80,7 @@ export type SearchCrawlerUserData = { contentCrawlerKey: string; responseId: string; contentScraperSettings: ContentScraperSettings; + blockMedia: boolean; }; export type ContentCrawlerUserData = { diff --git a/src/utils.ts b/src/utils.ts index f777cd6..c0106c5 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -48,6 +48,7 @@ export function createSearchRequest( contentCrawlerKey: string, proxyConfiguration: ProxyConfiguration | undefined, contentScraperSettings: ContentScraperSettings, + blockMedia: boolean, ): RequestOptions { // add some overhead for the maxResults to account for the fact that some results are not Organic const n = Number(maxResults) + 5; @@ -67,6 +68,7 @@ export function createSearchRequest( contentCrawlerKey, contentScraperSettings, responseId, + blockMedia, }, }; } From 4e88619deede6f5a4290181fcdb66fffb0dac43d Mon Sep 17 00:00:00 2001 From: Jiri Spilka Date: Tue, 18 Mar 2025 22:40:58 +0100 Subject: [PATCH 8/8] fix: update README.md --- README.md | 24 ++++++++++------ src/performance-evaluation.ts | 2 +- src/performance-measures.ts | 52 ----------------------------------- 3 files changed, 17 insertions(+), 61 deletions(-) delete mode 100644 src/performance-measures.ts diff --git a/README.md b/README.md index 663aefd..039a801 100644 --- a/README.md +++ b/README.md @@ -281,23 +281,31 @@ increase `requestTimeoutSecs` accordingly. Below is a typical latency breakdown for RAG Web Browser with **maxResults** set to either `1` or `3`, and various memory settings. These settings allow for processing all search results in parallel. -The numbers below are based on the following search terms: "apify", "Donald Trump", "boston". +The numbers below are based on the following search terms: "apify", "Donald Trump", "AI Agents". Results were averaged for the three queries. -| Memory (GB) | Max results | Latency (sec) | -|-------------|-------------|---------------| -| 4 | 1 | 22 | -| 4 | 3 | 31 | -| 8 | 1 | 16 | -| 8 | 3 | 17 | +| Memory (GB) | Scraping Tool | Max Results | Latency (sec) | +|-------------|--------------------|-------------|---------------| +| 8 | raw-http | 1 | 3.4 | +| 8 | browser-playwright | 1 | 8.9 | +| 8 | raw-http | 3 | 5.4 | +| 8 | browser-playwright | 3 | 13.6 | -Please note the these results are only indicative and may vary based on the search term, target websites, and network latency. +| Memory (GB) | Scraping Tool | Max Results | Latency (sec) | +|-------------|--------------------|-------------|---------------| +| 4 | raw-http | 1 | 4.1 | +| 4 | raw-http | 3 | 4.8 | +| 4 | browser-playwright | 1 | 16.5 | +| 4 | browser-playwright | 3 | 20.6 | + +Please note that these results are only indicative and may vary based on the search term, target websites, and network latency. ## 💰 Pricing The RAG Web Browser is free of charge, and you only pay for the Apify platform consumption when it runs. The main driver of the price is the Actor compute units (CUs), which are proportional to the amount of Actor run memory and run time (1 CU = 1 GB memory x 1 hour). +Another thing to consider is proxy traffic; residential proxies are more expensive than datacenter proxies. ## ⓘ Limitations and feedback diff --git a/src/performance-evaluation.ts b/src/performance-evaluation.ts index 850ca02..df432da 100644 --- a/src/performance-evaluation.ts +++ b/src/performance-evaluation.ts @@ -43,7 +43,7 @@ const scrapingToolSet = ['raw-http', 'browser-playwright']; const mediaBlockedSet = [true, false]; const maxResultsSet = [1, 3]; -const url = `https://${urlUserActor}.apify.actor/search`; +const url = `https://${urlUserActor}.apify.actor`; const headers = { Accept: 'application/json', diff --git a/src/performance-measures.ts b/src/performance-measures.ts deleted file mode 100644 index 77fa360..0000000 --- a/src/performance-measures.ts +++ /dev/null @@ -1,52 +0,0 @@ -import { Actor } from 'apify'; - -/** - * Compute average time for each time measure event - */ - -// const datasetId = 'aDnsnaBqGb8eTdpGv'; // 2GB, maxResults=1 -// const datasetId = 'giAPLL8dhd2PDqPlf'; // 2GB, maxResults=5 -// const datasetId = 'VKzel6raVqisgIYfe'; // 4GB, maxResults=1 -// const datasetId = 'KkTaLd70HbFgAO35y'; // 4GB, maxResults=3 -// const datasetId = 'fm9tO0GDBUagMT0df'; // 4GB, maxResults=5 -// const datasetId = '6ObH057Icr9z1bgXl'; // 8GB, maxResults=1 -const datasetId = 'lfItikr0vAXv7oXwH'; // 8GB, maxResults=3 - -// set environment variables APIFY_TOKEN -process.env.APIFY_TOKEN = ''; - -const dataset = await Actor.openDataset(datasetId, { forceCloud: true }); -const remoteDataset = await dataset.getData(); - -const timeMeasuresMap = new Map(); -const timeMeasuresTimeTaken = []; - -// compute average time for the timeMeasures -for (const item of remoteDataset.items) { - const { timeMeasures } = item.crawl.debug; - - for (const measure of timeMeasures) { - if (!timeMeasuresMap.has(measure.event)) { - timeMeasuresMap.set(measure.event, []); - } - timeMeasuresMap.set(measure.event, [...timeMeasuresMap.get(measure.event)!, measure.timeDeltaPrevMs]); - - if (measure.event === 'playwright-before-response-send') { - timeMeasuresTimeTaken.push(measure.timeMs); - } - } -} -// eslint-disable-next-line no-console -console.log('Average time for each time measure event:', timeMeasuresMap); - -for (const [key, value] of timeMeasuresMap) { - const sum = value.reduce((a, b) => a + b, 0); - const avg = sum / value.length; - // eslint-disable-next-line no-console - console.log(`${key}: ${avg.toFixed(0)} s`); -} - -// eslint-disable-next-line no-console -console.log('Time taken for each request:', timeMeasuresTimeTaken); -// eslint-disable-next-line no-console -console.log('Time taken on average', timeMeasuresTimeTaken.reduce((a, b) => a + b, 0) / timeMeasuresTimeTaken.length);