
Commit e0b52c1

feat: add worker types

1 parent 38539d5 commit e0b52c1

File tree: 2 files changed, +216 −0 lines

maxun-core/src/types/worker.ts

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
export interface WorkerConfig {
  workerIndex: number;
  startIndex: number;
  endIndex: number;
  batchSize: number;
  pageUrls: string[];
  listSelector: string;
  fields: any;
  pagination: {
    type: string;
    selector: string;
  };
}

export interface SharedState {
  totalScraped: number;
  results: any[];
}

export interface WorkerProgressData {
  percentage: number;
  currentUrl: string;
  scrapedItems: number;
  timeElapsed: number;
  estimatedTimeRemaining: number;
  failures: number;
  performance: PerformanceMetrics;
}

export interface PerformanceMetrics {
  startTime: number;
  endTime: number;
  duration: number;
  pagesProcessed: number;
  itemsScraped: number;
  failedPages: number;
  averageTimePerPage: number;
  memoryUsage: {
    heapUsed: number;
    heapTotal: number;
    external: number;
    rss: number;
  };
  cpuUsage: {
    user: number;
    system: number;
  };
}

export interface GlobalMetrics {
  totalPagesProcessed: number;
  totalItemsScraped: number;
  totalFailures: number;
  workersActive: number;
  averageSpeed: number;
  timeElapsed: number;
  memoryUsage: NodeJS.MemoryUsage;
  cpuUsage: NodeJS.CpuUsage;
}
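
For orientation, a minimal sketch of how a coordinator thread might populate these types and spawn the worker added in the second file below. Every concrete value, the shape of `fields` (typed `any` above), and the import and script paths are illustrative assumptions, not part of this commit:

import { Worker } from 'worker_threads';
import path from 'path';
import type { WorkerConfig, SharedState } from './types/worker';

// All values below are placeholders for illustration only.
const config: WorkerConfig = {
  workerIndex: 0,
  startIndex: 0,
  endIndex: 50,
  batchSize: 50,
  pageUrls: ['https://example.com/list?page=1', 'https://example.com/list?page=2'],
  listSelector: '.item',
  fields: { title: { selector: 'h2' } }, // shape assumed; the type is `any`
  pagination: { type: 'none', selector: '' },
};

const sharedState: SharedState = { totalScraped: 0, results: [] };

// Path assumed to point at the compiled worker script from this commit.
const worker = new Worker(path.join(__dirname, 'utils', 'worker.js'), {
  workerData: { config, sharedState },
});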

maxun-core/src/utils/worker.ts

Lines changed: 157 additions & 0 deletions
@@ -0,0 +1,157 @@
import { parentPort, workerData } from 'worker_threads';
import { chromium, Browser, Page } from 'playwright';
import path from 'path';
import type { WorkerConfig, SharedState } from '../types/worker';

async function initializeBrowser(): Promise<Browser> {
  return await chromium.launch({
    headless: true,
    args: [
      "--disable-blink-features=AutomationControlled",
      "--disable-web-security",
      "--disable-features=IsolateOrigins,site-per-process",
      "--disable-site-isolation-trials",
      "--disable-extensions",
      "--no-sandbox",
      "--disable-dev-shm-usage",
    ]
  });
}

async function ensureScriptsLoaded(page: Page) {
  const isScriptLoaded = await page.evaluate(() =>
    typeof window.scrape === 'function' &&
    typeof window.scrapeSchema === 'function' &&
    typeof window.scrapeList === 'function' &&
    typeof window.scrapeListAuto === 'function' &&
    typeof window.scrollDown === 'function' &&
    typeof window.scrollUp === 'function'
  );

  if (!isScriptLoaded) {
    await page.addInitScript({
      path: path.join(__dirname, '..', 'browserSide', 'scraper.js')
    });
  }
}

async function scrapeBatch(config: WorkerConfig, sharedState: SharedState) {
  const results: any[] = [];
  const scrapedItems = new Set<string>();
  let browser: Browser | null = null;
  let page: Page | null = null;

  try {
    browser = await initializeBrowser();
    const context = await browser.newContext();
    page = await context.newPage();
    await ensureScriptsLoaded(page);

    for (const [pageIndex, pageUrl] of config.pageUrls.entries()) {
      const pageStartTime = Date.now();

      try {
        // Report progress to main thread
        parentPort?.postMessage({
          type: 'progress',
          data: {
            workerId: config.workerIndex,
            currentUrl: pageUrl,
            processedUrls: pageIndex,
            totalUrls: config.pageUrls.length,
            timeElapsed: Date.now() - pageStartTime,
            scrapedItems: results.length
          }
        });

        const navigationResult = await page.goto(pageUrl, {
          waitUntil: 'networkidle',
          timeout: 30000
        });

        if (!navigationResult) continue;

        await page.waitForLoadState('networkidle').catch(() => {});

        const scrapeConfig = {
          listSelector: config.listSelector,
          fields: config.fields,
          pagination: config.pagination,
          limit: config.endIndex - config.startIndex - results.length
        };

        const pageResults = await page.evaluate(
          (cfg) => window.scrapeList(cfg),
          scrapeConfig
        );

        // Filter out duplicates
        const newResults = pageResults.filter(item => {
          const uniqueKey = JSON.stringify(item);

          // Check against local duplicates
          if (scrapedItems.has(uniqueKey)) return false;

          // Check against shared state results
          const isDuplicate = sharedState.results.some(
            existingItem => JSON.stringify(existingItem) === uniqueKey
          );

          if (isDuplicate) return false;
          scrapedItems.add(uniqueKey);
          sharedState.results.push(item);
          sharedState.totalScraped++;
          return true;
        });

        results.push(...newResults);

        if (results.length >= config.batchSize) break;

        await page.waitForTimeout(1000);

      } catch (error) {
        parentPort?.postMessage({
          type: 'error',
          data: {
            workerId: config.workerIndex,
            url: pageUrl,
            error: error.message
          }
        });
        continue;
      }
    }

    return results;

  } catch (error) {
    throw error;
  } finally {
    if (page) await page.close();
    if (browser) await browser.close();
  }
}

// Handle worker initialization
if (parentPort) {
  const config: WorkerConfig = workerData.config;
  const sharedState: SharedState = workerData.sharedState;

  scrapeBatch(config, sharedState)
    .then(results => {
      parentPort?.postMessage({
        type: 'complete',
        data: results
      });
    })
    .catch(error => {
      parentPort?.postMessage({
        type: 'error',
        data: {
          workerId: config.workerIndex,
          error: error.message
        }
      });
    });
}
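
The worker speaks to its parent only through messages of type 'progress', 'error', and 'complete'. A minimal sketch of the consuming side, wrapped in a hypothetical runWorker helper (the helper name and script path are assumptions, not part of this commit):

import { Worker } from 'worker_threads';
import path from 'path';
import type { WorkerConfig, SharedState } from '../types/worker';

// Hypothetical helper: spawns one worker and resolves with its batch results.
function runWorker(config: WorkerConfig, sharedState: SharedState): Promise<any[]> {
  return new Promise((resolve, reject) => {
    // Path assumed to point at the compiled worker script above.
    const worker = new Worker(path.join(__dirname, 'worker.js'), {
      workerData: { config, sharedState },
    });

    worker.on('message', (msg: { type: string; data: any }) => {
      switch (msg.type) {
        case 'progress':
          // Shape matches the payload posted inside scrapeBatch.
          console.log(
            `worker ${msg.data.workerId}: ${msg.data.scrapedItems} items, at ${msg.data.currentUrl}`
          );
          break;
        case 'error':
          console.error(`worker ${msg.data.workerId} error:`, msg.data.error);
          break;
        case 'complete':
          resolve(msg.data);
          break;
      }
    });

    worker.on('error', reject);
    worker.on('exit', (code) => {
      if (code !== 0) reject(new Error(`worker stopped with exit code ${code}`));
    });
  });
}

One design note: workerData is delivered to the thread by structured clone, so each worker mutates its own copy of sharedState, and the duplicate check against sharedState.results only sees items present at spawn time plus the worker's own additions. Deduplication across concurrently running workers would need message passing or shared memory instead.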
