diff --git a/README.md b/README.md index 663aefd..039a801 100644 --- a/README.md +++ b/README.md @@ -281,23 +281,31 @@ increase `requestTimeoutSecs` accordingly. Below is a typical latency breakdown for RAG Web Browser with **maxResults** set to either `1` or `3`, and various memory settings. These settings allow for processing all search results in parallel. -The numbers below are based on the following search terms: "apify", "Donald Trump", "boston". +The numbers below are based on the following search terms: "apify", "Donald Trump", "AI Agents". Results were averaged for the three queries. -| Memory (GB) | Max results | Latency (sec) | -|-------------|-------------|---------------| -| 4 | 1 | 22 | -| 4 | 3 | 31 | -| 8 | 1 | 16 | -| 8 | 3 | 17 | +| Memory (GB) | Scraping Tool | Max Results | Latency (sec) | +|-------------|--------------------|-------------|---------------| +| 8 | raw-http | 1 | 3.4 | +| 8 | browser-playwright | 1 | 8.9 | +| 8 | raw-http | 3 | 5.4 | +| 8 | browser-playwright | 3 | 13.6 | -Please note the these results are only indicative and may vary based on the search term, target websites, and network latency. +| Memory (GB) | Scraping Tool | Max Results | Latency (sec) | +|-------------|--------------------|-------------|---------------| +| 4 | raw-http | 1 | 4.1 | +| 4 | raw-http | 3 | 4.8 | +| 4 | browser-playwright | 1 | 16.5 | +| 4 | browser-playwright | 3 | 20.6 | + +Please note that these results are only indicative and may vary based on the search term, target websites, and network latency. ## 💰 Pricing The RAG Web Browser is free of charge, and you only pay for the Apify platform consumption when it runs. The main driver of the price is the Actor compute units (CUs), which are proportional to the amount of Actor run memory and run time (1 CU = 1 GB memory x 1 hour). +Another thing to consider is proxy traffic; residential proxies are more expensive than datacenter proxies. 
## ⓘ Limitations and feedback diff --git a/src/const.ts b/src/const.ts index 64f1015..e27b8df 100644 --- a/src/const.ts +++ b/src/const.ts @@ -21,6 +21,7 @@ export const PLAYWRIGHT_REQUEST_TIMEOUT_NORMAL_MODE_SECS = 60; // Default values parsed from input_schema.json export const defaults = { + blockMedia: inputSchema.properties.blockMedia.default, debugMode: inputSchema.properties.debugMode.default, dynamicContentWaitSecs: inputSchema.properties.dynamicContentWaitSecs.default, htmlTransformer: inputSchema.properties.htmlTransformer.default, @@ -37,12 +38,11 @@ export const defaults = { query: undefined, // No default value in input_schema.json readableTextCharThreshold: 100, // Not in input_schema.json removeCookieWarnings: inputSchema.properties.removeCookieWarnings.default, - blockMedia: inputSchema.properties.blockMedia.default, removeElementsCssSelector: inputSchema.properties.removeElementsCssSelector.default, requestTimeoutSecs: inputSchema.properties.requestTimeoutSecs.default, requestTimeoutSecsMax: inputSchema.properties.requestTimeoutSecs.maximum, + scrapingTool: inputSchema.properties.scrapingTool.default, serpMaxRetries: inputSchema.properties.serpMaxRetries.default, serpMaxRetriesMax: inputSchema.properties.serpMaxRetries.maximum, serpProxyGroup: inputSchema.properties.serpProxyGroup.default, - scrapingTool: inputSchema.properties.scrapingTool.default, }; diff --git a/src/crawlers.ts b/src/crawlers.ts index eb28d20..5689c16 100644 --- a/src/crawlers.ts +++ b/src/crawlers.ts @@ -75,6 +75,7 @@ export async function createAndStartSearchCrawler( responseId, request.userData.contentScraperSettings!, request.userData.timeMeasures!, + request.userData.blockMedia, ); await addContentCrawlRequest(r, responseId, request.userData.contentCrawlerKey!); } @@ -130,6 +131,31 @@ export async function createAndStartContentCrawler( return { key, crawler }; } +/** + * PreNavigation hook that blocks resources based on the blockMedia setting + * from the request's 
userData. + * Only blocks resources if blockMedia is true. + */ +async function blockMediaResourcesHook({ page, request }: PlaywrightCrawlingContext) { + await page.route('**/*', async (route) => { + const resourceType = route.request().resourceType(); + const url = route.request().url(); + + // Block if it's an image/video/css resource type or has an image/video extension + if (request.userData.blockMedia && ( + resourceType === 'image' + || resourceType === 'video' + || resourceType === 'media' + || resourceType === 'stylesheet' + || /\.(jpg|jpeg|png|gif|bmp|webp|mp4|webm|ogg|mov|css)$/i.test(url) + )) { + await route.abort(); + } else { + await route.continue(); + } + }); +} + async function createPlaywrightContentCrawler( crawlerOptions: PlaywrightCrawlerOptions, key: string, @@ -143,6 +169,11 @@ async function createPlaywrightContentCrawler( await requestHandlerPlaywright(context as unknown as PlaywrightCrawlingContext); }, failedRequestHandler: ({ request }, err) => failedRequestHandler(request, err, ContentCrawlerTypes.PLAYWRIGHT), + preNavigationHooks: [ + async (context) => { + await blockMediaResourcesHook(context as unknown as PlaywrightCrawlingContext); + }, + ], }); } diff --git a/src/input.ts b/src/input.ts index a44a2ad..15e94e3 100644 --- a/src/input.ts +++ b/src/input.ts @@ -46,8 +46,13 @@ async function processInputInternal( if (originalInput.outputFormats && typeof originalInput.outputFormats === 'string') { originalInput.outputFormats = originalInput.outputFormats.split(',').map((format) => format.trim()) as OutputFormats[]; } - const input = { ...defaults, ...originalInput } as Input; + // noinspection SuspiciousTypeOfGuard + if (typeof originalInput.blockMedia === 'string') { + originalInput.blockMedia = originalInput.blockMedia === 'true' || originalInput.blockMedia === '1'; + } + + const input = { ...defaults, ...originalInput } as Input; validateAndFillInput(input, standbyInit); const { @@ -111,27 +116,6 @@ function 
createPlaywrightCrawlerOptions(input: Input, proxy: ProxyConfiguration maxConcurrency, minConcurrency, }, - preNavigationHooks: input.blockMedia ? [ - async ({ page }) => { - await page.route('**/*', async (route) => { - const resourceType = route.request().resourceType(); - const url = route.request().url(); - - // Block if it's an image/video/css resource type or has an image/video extension - if ( - resourceType === 'image' - || resourceType === 'video' - || resourceType === 'media' - || resourceType === 'stylesheet' - || /\.(jpg|jpeg|png|gif|bmp|webp|mp4|webm|ogg|mov|css)$/i.test(url) - ) { - await route.abort(); - } else { - await route.continue(); - } - }); - }, - ] : [], }, }; } @@ -207,4 +191,8 @@ export function validateAndFillInput(input: Input, standbyInit: boolean) { if (input.scrapingTool !== 'browser-playwright' && input.scrapingTool !== 'raw-http') { throw new UserInputError('The `scrapingTool` parameter must be either `browser-playwright` or `raw-http`.'); } + // if blockMedia was not provided, fall back to its default from the input schema (string coercion is handled earlier) + if (input.blockMedia === undefined || input.blockMedia === null) { + input.blockMedia = defaults.blockMedia; + } } diff --git a/src/performance-evaluation.ts b/src/performance-evaluation.ts new file mode 100644 index 0000000..df432da --- /dev/null +++ b/src/performance-evaluation.ts @@ -0,0 +1,142 @@ +/** + * Performance evaluation of the RAG Web Browser with respect to different settings. + * This script runs a series of queries and saves performance results into a dataset. + * The results include average time for each time measure event. + * + * The evaluation is performed with different combinations of the following parameters: + * - `scrapingTool`: The tool used for scraping (e.g., `raw-http`, `browser-playwright`). + * - `mediaBlocked`: Whether media content is blocked during scraping (true/false). + * - `maxResults`: The maximum number of results to scrape (e.g., 1, 3). 
+ * + * The script performs the following steps: + * 1. Runs a set of predefined queries using different combinations of parameters. + * 2. Fetches the results and computes the average time for each time measure event. + * 3. Logs the performance results, including average latency for each combination of parameters. + * 4. Aborts the last run of the actor to ensure no resources are wasted. + * + * The results are stored in a table format, showing the average latency for each combination of parameters. + * + * Usage: + * - Ensure the `APIFY_TOKEN` environment variable is set with your Apify API token. + * - Run the script to perform the performance evaluation. + * - The results will be logged to the console. + */ + +import { log } from 'apify'; + +import { Output } from './types'; + +const EVALUATION_QUERIES = [ + 'apify', + 'donald trump', + 'ai agents', +]; + +const apifyToken = process.env.APIFY_TOKEN; + +const user = 'jiri-spilka'; +const actorId = 'apify~rag-web-browser'; +const urlUserActor = `${user}--rag-web-browser-task`; + +const memory = 8; // memory can't be changed in the standby mode +const scrapingToolSet = ['raw-http', 'browser-playwright']; +const mediaBlockedSet = [true, false]; +const maxResultsSet = [1, 3]; + +const url = `https://${urlUserActor}.apify.actor`; + +const headers = { + Accept: 'application/json', + Authorization: `Bearer ${apifyToken}`, +}; + +const results = new Map(); +const resultsTable = []; + +for (const scrapingTool of scrapingToolSet) { + for (const blockMedia of mediaBlockedSet) { + for (const maxResults of maxResultsSet) { + log.info(`Running ${EVALUATION_QUERIES.length} query/queries with ${scrapingTool}, mediaBlocked=${blockMedia}, maxResults=${maxResults}`); + log.info('Start in standby mode'); + const r1 = await fetch(url, { method: 'GET', headers }); + if (!r1.ok) { + throw new Error(`Failed to run the actor: ${JSON.stringify(await r1.json())}`); + } else { + // sleep for 10 seconds to let the actor start + await new 
Promise((resolve) => setTimeout(resolve, 10000)); + } + for (const q of EVALUATION_QUERIES) { + const queryParams = new URLSearchParams({ query: q, scrapingTool, blockMedia: blockMedia.toString(), debugMode: 'true', maxResults: maxResults.toString() }); + const urlWithParams = `${url}/search?${queryParams.toString()}`; + log.info(`Running ${urlWithParams}`); + const res = await fetch(urlWithParams, { method: 'GET', headers }); + if (!res.ok) { + throw new Error(`Failed to run the actor: ${JSON.stringify(await res.json())}`); + } + const data: Output[] = await res.json(); + log.info(`Received number of results: ${data.length}`); + const k = `${scrapingTool}__${blockMedia ? 'blocked' : 'allowed'}__${maxResults}`; + if (results.has(k)) { + results.set(k, [...results.get(k)!, ...data]); + } else { + results.set(k, data); + } + } + log.info(`Get the last run: ${actorId}`); + const response = await fetch(`https://api.apify.com/v2/acts/${actorId}/runs/last`, { headers }); + const resp = await response.json(); + const { id: runId } = resp.data; + + // it is better to abort run not to mix results and involve autoscaling into the mix + log.info(`Abort run ${runId}`); + const r = await fetch(`https://api.apify.com/v2/actor-runs/${runId}/abort`, { method: 'POST', headers }); + log.info(`The last run has been aborted status=${r.status}`); + } + } +} + +for (const [key, data] of results) { + const remoteDataset = data; + log.info('Compute average time for each time measure event'); + const timeMeasuresMap = new Map(); + const timeMeasuresTimeTaken = []; + + // compute average time for the timeMeasures + for (const item of remoteDataset) { + const { timeMeasures } = item.crawl.debug ?? 
{}; + if (!timeMeasures) { + continue; + } + for (const measure of timeMeasures) { + if (!timeMeasuresMap.has(measure.event)) { + timeMeasuresMap.set(measure.event, []); + } + timeMeasuresMap.set(measure.event, [...timeMeasuresMap.get(measure.event)!, measure.timeDeltaPrevMs]); + if (measure.event === 'playwright-before-response-send' || measure.event === 'cheerio-before-response-send') { + timeMeasuresTimeTaken.push(measure.timeMs); + } + } + } + log.info(`Performance for key: ${key}`); + log.info('Average time for each time measure event:', timeMeasuresMap); + + for (const [k, value] of timeMeasuresMap) { + const sum = value.reduce((a, b) => a + b, 0); + const avg = sum / value.length; + log.info(`${k}: ${avg.toFixed(0)} ms`); + } + + const avgLatency = timeMeasuresTimeTaken.reduce((a, b) => a + b, 0) / timeMeasuresTimeTaken.length / 1000; + log.info('Time taken for each request:', timeMeasuresTimeTaken); + log.info('Time taken on average', { average: avgLatency.toFixed(1) }); + + // Store results for the table + const [scrapingTool, mediaBlocked, maxResults] = key.split('__'); + resultsTable.push(`| ${memory} | ${scrapingTool} | ${mediaBlocked} | ${maxResults} | ${avgLatency.toFixed(1)} |`); +} + +// Print the results table +log.info('\nPerformance Results:'); +log.info('| Memory (GB) | Scraping Tool | Media | Max Results | Latency (sec) |'); +log.info('|-------------|---------------|---------------|-------------|---------------|'); +resultsTable.forEach((row) => log.info(row)); diff --git a/src/performance-measures.ts b/src/performance-measures.ts deleted file mode 100644 index 77fa360..0000000 --- a/src/performance-measures.ts +++ /dev/null @@ -1,52 +0,0 @@ -import { Actor } from 'apify'; - -/** - * Compute average time for each time measure event - */ - -// const datasetId = 'aDnsnaBqGb8eTdpGv'; // 2GB, maxResults=1 -// const datasetId = 'giAPLL8dhd2PDqPlf'; // 2GB, maxResults=5 -// const datasetId = 'VKzel6raVqisgIYfe'; // 4GB, maxResults=1 -// const 
datasetId = 'KkTaLd70HbFgAO35y'; // 4GB, maxResults=3 -// const datasetId = 'fm9tO0GDBUagMT0df'; // 4GB, maxResults=5 -// const datasetId = '6ObH057Icr9z1bgXl'; // 8GB, maxResults=1 -const datasetId = 'lfItikr0vAXv7oXwH'; // 8GB, maxResults=3 - -// set environment variables APIFY_TOKEN -process.env.APIFY_TOKEN = ''; - -const dataset = await Actor.openDataset(datasetId, { forceCloud: true }); -const remoteDataset = await dataset.getData(); - -const timeMeasuresMap = new Map(); -const timeMeasuresTimeTaken = []; - -// compute average time for the timeMeasures -for (const item of remoteDataset.items) { - const { timeMeasures } = item.crawl.debug; - - for (const measure of timeMeasures) { - if (!timeMeasuresMap.has(measure.event)) { - timeMeasuresMap.set(measure.event, []); - } - timeMeasuresMap.set(measure.event, [...timeMeasuresMap.get(measure.event)!, measure.timeDeltaPrevMs]); - - if (measure.event === 'playwright-before-response-send') { - timeMeasuresTimeTaken.push(measure.timeMs); - } - } -} -// eslint-disable-next-line no-console -console.log('Average time for each time measure event:', timeMeasuresMap); - -for (const [key, value] of timeMeasuresMap) { - const sum = value.reduce((a, b) => a + b, 0); - const avg = sum / value.length; - // eslint-disable-next-line no-console - console.log(`${key}: ${avg.toFixed(0)} s`); -} - -// eslint-disable-next-line no-console -console.log('Time taken for each request:', timeMeasuresTimeTaken); -// eslint-disable-next-line no-console -console.log('Time taken on average', timeMeasuresTimeTaken.reduce((a, b) => a + b, 0) / timeMeasuresTimeTaken.length); diff --git a/src/search.ts b/src/search.ts index 306f7f4..435765e 100644 --- a/src/search.ts +++ b/src/search.ts @@ -39,6 +39,7 @@ function prepareRequest( responseId, contentScraperSettings, null, + input.blockMedia, ) : createSearchRequest( query, @@ -47,6 +48,7 @@ function prepareRequest( contentCrawlerKey, searchCrawlerOptions.proxyConfiguration, contentScraperSettings, + 
input.blockMedia, ); addTimeMeasureEvent(req.userData!, 'request-received', Date.now()); diff --git a/src/types.ts b/src/types.ts index 21c5642..5c4d8ab 100644 --- a/src/types.ts +++ b/src/types.ts @@ -37,6 +37,7 @@ export type Input = { export type StandbyInput = Input & { outputFormats: OutputFormats[] | string + blockMedia: boolean | string; } export type OrganicResult = { @@ -79,6 +80,7 @@ export type SearchCrawlerUserData = { contentCrawlerKey: string; responseId: string; contentScraperSettings: ContentScraperSettings; + blockMedia: boolean; }; export type ContentCrawlerUserData = { @@ -88,6 +90,7 @@ export type ContentCrawlerUserData = { searchResult?: OrganicResult; contentCrawlerKey?: string; contentScraperSettings: ContentScraperSettings; + blockMedia?: boolean; }; export interface ContentScraperSettings { @@ -113,7 +116,9 @@ export type Output = { loadedAt?: Date; requestStatus: string; uniqueKey: string; - debug?: unknown; + debug?: { + timeMeasures?: TimeMeasure[]; + }; }; searchResult: OrganicResult; metadata: { diff --git a/src/utils.ts b/src/utils.ts index 4028e28..c0106c5 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -48,6 +48,7 @@ export function createSearchRequest( contentCrawlerKey: string, proxyConfiguration: ProxyConfiguration | undefined, contentScraperSettings: ContentScraperSettings, + blockMedia: boolean, ): RequestOptions { // add some overhead for the maxResults to account for the fact that some results are not Organic const n = Number(maxResults) + 5; @@ -67,6 +68,7 @@ export function createSearchRequest( contentCrawlerKey, contentScraperSettings, responseId, + blockMedia, }, }; } @@ -80,6 +82,7 @@ export function createRequest( responseId: string, contentScraperSettings: ContentScraperSettings, timeMeasures: TimeMeasure[] | null = null, + blockMedia: boolean = false, ): RequestOptions { return { url: result.url!, @@ -90,6 +93,7 @@ export function createRequest( searchResult: result.url && result.title ? 
result : undefined, timeMeasures: timeMeasures ? [...timeMeasures] : [], contentScraperSettings, + blockMedia, }, }; }