Skip to content

feat: unbloat, reduce image size #70

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 26 additions & 46 deletions .actor/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,65 +1,45 @@
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:22-1.46.0 AS builder
FROM node:22-bookworm AS builder

WORKDIR /app

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./
COPY package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false
RUN npm ci

# Next, copy the source files using the user set
# in the base image.
COPY --chown=myuser . ./
# Copy the source files.
COPY . ./

# Install all dependencies and build the project.
# Don't audit to speed up the installation.
RUN npm run build

# Create final image
FROM apify/actor-node-playwright-firefox:22-1.46.0

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./
# Install Playwright and Firefox.
RUN npx -y playwright install --with-deps firefox

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
# Move the installed browser to a separate directory.
RUN mkdir -p /app/ms-playwright \
&& mv $(find ~/.cache/ms-playwright -type d -name "firefox" -path "*/firefox" | head -n 1) /app/ms-playwright/firefox

# Remove the existing firefox installation
RUN rm -rf ${PLAYWRIGHT_BROWSERS_PATH}/*
# Move Firefox libs to a separate directory for later copy.
RUN apt-get -y install strace && \
mkdir -p /firefox/lib/ && \
strace -e openat /app/ms-playwright/firefox/firefox 2>&1 | grep /lib/x86 | awk -F'"' '{print $2}' | xargs -I {} cp {} /firefox/lib/

# Install all required playwright dependencies for firefox
RUN npx playwright install firefox
# symlink the firefox binary to the root folder in order to bypass the versioning and resulting browser launch crashes.
RUN ln -s ${PLAYWRIGHT_BROWSERS_PATH}/firefox-*/firefox/firefox ${PLAYWRIGHT_BROWSERS_PATH}/
# Create final image
FROM gcr.io/distroless/nodejs22-debian12

# Overrides the dynamic library used by Firefox to determine trusted root certificates with p11-kit-trust.so, which loads the system certificates.
RUN rm $PLAYWRIGHT_BROWSERS_PATH/firefox-*/firefox/libnssckbi.so
RUN ln -s /usr/lib/x86_64-linux-gnu/pkcs11/p11-kit-trust.so $(ls -d $PLAYWRIGHT_BROWSERS_PATH/firefox-*)/firefox/libnssckbi.so
WORKDIR /app

# Copy built JS files from builder image
COPY --from=builder --chown=myuser /home/myuser/dist ./dist
# Copy the node_modules and built app from the build stage
COPY --from=builder /app /app

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY --chown=myuser . ./
# Copy Firefox libs
COPY --from=builder /firefox/lib/* /lib/x86_64-linux-gnu/

# Disable experimental feature warning from Node.js
ENV NODE_NO_WARNINGS=1
# Set the default browser path to the installed Firefox.
ENV APIFY_DEFAULT_BROWSER_PATH=/app/ms-playwright/firefox/firefox

# Run the image.
CMD npm run start:prod --silent
CMD ["/app/dist/src/main.js", "--silent"]
194 changes: 194 additions & 0 deletions benchmark/runtime.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
/*
Benchmark for Actor runtime

This benchmark was mainly created for testing Actor run time performance to compare previous and distroless build.
*/
import { ActorRun, ApifyClient } from 'apify-client';

// Configuration constants
const API_TOKEN = process.env.APIFY_TOKEN;
//const ACTOR_NAME = 'apify/rag-web-browser';
const ACTOR_NAME = 'jakub.kopecky/rag-web-browser';
const MAX_RUNS = 500;
const MAX_MEMORY_GB = 64; // Total memory available
const ACTOR_MEMORY_GB = 1; // Memory per actor run
const ACTOR_INPUT = {
query: 'apify ai',
maxResults: 1,
};

// Initialize Apify client
const client = new ApifyClient({ token: API_TOKEN });

async function computeLogTimes(run: ActorRun): Promise<{
pullToStartTime: number;
startToSystemTime: number;
}> {
const log = await client.run(run.id).log().get();
if (!log) {
throw new Error(`Failed to get logs for run ${run.id}`);
}

const lines = log.split('\n');
// get initial
const startTimeStr = lines[0].split(' ')[0];
const startTime = new Date(startTimeStr).getTime();

// starting container
const startContainerLine = lines.find(line => line.includes('Starting Docker container'));
if (!startContainerLine) {
throw new Error(`Failed to find start container line in logs for run ${run.id}`);
}
const startContainerTimeStr = startContainerLine?.split(' ')[0];
const startContainerTime = new Date(startContainerTimeStr).getTime();

// system info
const systemInfoLine = lines.find(line => line.includes('System info'));
if (!systemInfoLine) {
throw new Error(`Failed to find system info line in logs for run ${run.id}`);
}
const systemInfoTimeStr = systemInfoLine?.split(' ')[0];
const systemInfoTime = new Date(systemInfoTimeStr).getTime();

// Calculate times
const pullToStartTime = (startContainerTime - startTime) / 1000; // in seconds
const startToSystemTime = (systemInfoTime - startContainerTime) / 1000; // in seconds

return {
pullToStartTime,
startToSystemTime,
};
}

async function waitForRunFinishAndHandle(concurrentRunIDs: string[]): Promise<{
pullToStartTime: number;
startToSystemTime: number;
run: ActorRun;
}> {
//const run = await client.run(concurrentRunIDs[0]).waitForFinish();
let runid: string | undefined;
while (!runid) {
runid = concurrentRunIDs.find(id => async () => {
const run = await client.run(id).get();
if (!run) {
throw new Error(`Failed to get run ${id}`);
}
return run.status === 'SUCCEEDED' || run.status === 'FAILED';
});
if (!runid) {
// sleep
await new Promise(resolve => setTimeout(resolve, 1000));
}
}
const run = await client.run(runid).waitForFinish();

console.log(`Run ${run.id} finished in ${run.stats.runTimeSecs} seconds`);
const { pullToStartTime, startToSystemTime } = await computeLogTimes(run);
concurrentRunIDs.shift(); // Remove the finished run from the list

return {
pullToStartTime,
startToSystemTime,
run,
}
}

async function runBenchmark() {
// Calculate max concurrent runs to avoid memory overload
const maxConcurrentRuns = Math.floor(MAX_MEMORY_GB / ACTOR_MEMORY_GB);
console.log(`Starting ${MAX_RUNS} runs with ${ACTOR_MEMORY_GB}GB per run, max ${maxConcurrentRuns} concurrent`);

// Track runs
const finishedRuns: ActorRun[] = [];
const concurrentRunIDs: string[] = [];

// Extracted times from logs
// Time to pull the Actor container
const logPullToStartTimes: number[] = [];
// Time from starting the container to the first system log
const logStartToSystemTimes: number[] = [];

// Actor run loop
while (finishedRuns.length + concurrentRunIDs.length < MAX_RUNS) {
if (concurrentRunIDs.length >= maxConcurrentRuns) {
const { pullToStartTime, startToSystemTime, run } = await waitForRunFinishAndHandle(concurrentRunIDs);
finishedRuns.push(run);
logPullToStartTimes.push(pullToStartTime);
logStartToSystemTimes.push(startToSystemTime);
}

const run = await client.actor(ACTOR_NAME).start(ACTOR_INPUT, { memory: ACTOR_MEMORY_GB * 1024 });
console.log(`Started run ${run.id} with ${ACTOR_MEMORY_GB}GB memory (${finishedRuns.length} finished, ${concurrentRunIDs.length} concurrent)`);
concurrentRunIDs.push(run.id);
}
// Wait for remaining runs to finish
while (concurrentRunIDs.length > 0) {
const { pullToStartTime, startToSystemTime, run } = await waitForRunFinishAndHandle(concurrentRunIDs);
finishedRuns.push(run);
logPullToStartTimes.push(pullToStartTime);
logStartToSystemTimes.push(startToSystemTime);
}

// Calculate run times
const runTimes: number[] = finishedRuns.map(run => run.stats.runTimeSecs);

// Log pull to start times
const averagePullToStartTime = logPullToStartTimes.reduce((sum, time) => sum + time, 0) / logPullToStartTimes.length;
const medianPullToStartTime = (() => {
const sorted = [...logPullToStartTimes].sort((a, b) => a - b);
const mid = Math.floor(sorted.length / 2);
return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
})();
const minPullToStartTime = Math.min(...logPullToStartTimes);
const maxPullToStartTime = Math.max(...logPullToStartTimes);
const stdDevPullToStartTime = Math.sqrt(logPullToStartTimes.reduce((sum, time) => sum + Math.pow(time - averagePullToStartTime, 2), 0) / logPullToStartTimes.length);

// Log start to system times
const averageStartToSystemTime = logStartToSystemTimes.reduce((sum, time) => sum + time, 0) / logStartToSystemTimes.length;
const medianStartToSystemTime = (() => {
const sorted = [...logStartToSystemTimes].sort((a, b) => a - b);
const mid = Math.floor(sorted.length / 2);
return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
})();
const minStartToSystemTime = Math.min(...logStartToSystemTimes);
const maxStartToSystemTime = Math.max(...logStartToSystemTimes);
const stdDevStartToSystemTime = Math.sqrt(logStartToSystemTimes.reduce((sum, time) => sum + Math.pow(time - averageStartToSystemTime, 2), 0) / logStartToSystemTimes.length);

// Compute average run time
const averageRunTime = runTimes.reduce((sum, time) => sum + time, 0) / runTimes.length;
const medianRunTime = (() => {
const sorted = [...runTimes].sort((a, b) => a - b);
const mid = Math.floor(sorted.length / 2);
return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
})();
const minRunTime = Math.min(...runTimes);
const maxRunTime = Math.max(...runTimes);
const stdDevRunTime = Math.sqrt(runTimes.reduce((sum, time) => sum + Math.pow(time - averageRunTime, 2), 0) / runTimes.length);

// Log results
console.log(`Completed ${MAX_RUNS} runs`);
console.log('------------------------------------------------------')
console.log(`Average pull to start time: ${averagePullToStartTime.toFixed(2)} seconds`);
console.log(`Median pull to start time: ${medianPullToStartTime.toFixed(2)} seconds`);
console.log(`Min pull to start time: ${minPullToStartTime.toFixed(2)} seconds`);
console.log(`Max pull to start time: ${maxPullToStartTime.toFixed(2)} seconds`);
console.log(`Standard deviation of pull to start times: ${stdDevPullToStartTime.toFixed(2)} seconds`);
console.log('------------------------------------------------------')
console.log(`Average start to system time: ${averageStartToSystemTime.toFixed(2)} seconds`);
console.log(`Median start to system time: ${medianStartToSystemTime.toFixed(2)} seconds`);
console.log(`Min start to system time: ${minStartToSystemTime.toFixed(2)} seconds`);
console.log(`Max start to system time: ${maxStartToSystemTime.toFixed(2)} seconds`);
console.log(`Standard deviation of start to system times: ${stdDevStartToSystemTime.toFixed(2)} seconds`);
console.log('------------------------------------------------------')
console.log(`Average total run time: ${averageRunTime.toFixed(2)} seconds`);
console.log(`Median total run time: ${medianRunTime.toFixed(2)} seconds`);
console.log(`Min total run time: ${minRunTime.toFixed(2)} seconds`);
console.log(`Max total run time: ${maxRunTime.toFixed(2)} seconds`);
console.log(`Standard deviation of total run times: ${stdDevRunTime.toFixed(2)} seconds`);
}

// Execute benchmark
runBenchmark().catch(error => {
console.error('Benchmark failed:', error.message);
process.exit(1);
});