1+ import { parentPort , workerData } from 'worker_threads' ;
2+ import { chromium , Browser , Page } from 'playwright' ;
3+ import path from 'path' ;
4+ import type { WorkerConfig , SharedState } from '../types/worker' ;
5+
6+ async function initializeBrowser ( ) : Promise < Browser > {
7+ return await chromium . launch ( {
8+ headless : true ,
9+ args : [
10+ "--disable-blink-features=AutomationControlled" ,
11+ "--disable-web-security" ,
12+ "--disable-features=IsolateOrigins,site-per-process" ,
13+ "--disable-site-isolation-trials" ,
14+ "--disable-extensions" ,
15+ "--no-sandbox" ,
16+ "--disable-dev-shm-usage" ,
17+ ]
18+ } ) ;
19+ }
20+
21+ async function ensureScriptsLoaded ( page : Page ) {
22+ const isScriptLoaded = await page . evaluate ( ( ) =>
23+ typeof window . scrape === 'function' &&
24+ typeof window . scrapeSchema === 'function' &&
25+ typeof window . scrapeList === 'function' &&
26+ typeof window . scrapeListAuto === 'function' &&
27+ typeof window . scrollDown === 'function' &&
28+ typeof window . scrollUp === 'function'
29+ ) ;
30+
31+ if ( ! isScriptLoaded ) {
32+ await page . addInitScript ( {
33+ path : path . join ( __dirname , '..' , 'browserSide' , 'scraper.js' )
34+ } ) ;
35+ }
36+ }
37+
38+ async function scrapeBatch ( config : WorkerConfig , sharedState : SharedState ) {
39+ const results : any [ ] = [ ] ;
40+ const scrapedItems = new Set < string > ( ) ;
41+ let browser : Browser | null = null ;
42+ let page : Page | null = null ;
43+
44+ try {
45+ browser = await initializeBrowser ( ) ;
46+ const context = await browser . newContext ( ) ;
47+ page = await context . newPage ( ) ;
48+ await ensureScriptsLoaded ( page ) ;
49+
50+ for ( const [ pageIndex , pageUrl ] of config . pageUrls . entries ( ) ) {
51+ const pageStartTime = Date . now ( ) ;
52+
53+ try {
54+ // Report progress to main thread
55+ parentPort ?. postMessage ( {
56+ type : 'progress' ,
57+ data : {
58+ workerId : config . workerIndex ,
59+ currentUrl : pageUrl ,
60+ processedUrls : pageIndex ,
61+ totalUrls : config . pageUrls . length ,
62+ timeElapsed : Date . now ( ) - pageStartTime ,
63+ scrapedItems : results . length
64+ }
65+ } ) ;
66+
67+ const navigationResult = await page . goto ( pageUrl , {
68+ waitUntil : 'networkidle' ,
69+ timeout : 30000
70+ } ) ;
71+
72+ if ( ! navigationResult ) continue ;
73+
74+ await page . waitForLoadState ( 'networkidle' ) . catch ( ( ) => { } ) ;
75+
76+ const scrapeConfig = {
77+ listSelector : config . listSelector ,
78+ fields : config . fields ,
79+ pagination : config . pagination ,
80+ limit : config . endIndex - config . startIndex - results . length
81+ } ;
82+
83+ const pageResults = await page . evaluate (
84+ ( cfg ) => window . scrapeList ( cfg ) ,
85+ scrapeConfig
86+ ) ;
87+
88+ // Filter out duplicates
89+ const newResults = pageResults . filter ( item => {
90+ const uniqueKey = JSON . stringify ( item ) ;
91+
92+ // Check against local duplicates
93+ if ( scrapedItems . has ( uniqueKey ) ) return false ;
94+
95+ // Check against shared state results
96+ const isDuplicate = sharedState . results . some (
97+ existingItem => JSON . stringify ( existingItem ) === uniqueKey
98+ ) ;
99+
100+ if ( isDuplicate ) return false ;
101+ scrapedItems . add ( uniqueKey ) ;
102+ sharedState . results . push ( item ) ;
103+ sharedState . totalScraped ++ ;
104+ return true ;
105+ } ) ;
106+
107+ results . push ( ...newResults ) ;
108+
109+ if ( results . length >= config . batchSize ) break ;
110+
111+ await page . waitForTimeout ( 1000 ) ;
112+
113+ } catch ( error ) {
114+ parentPort ?. postMessage ( {
115+ type : 'error' ,
116+ data : {
117+ workerId : config . workerIndex ,
118+ url : pageUrl ,
119+ error : error . message
120+ }
121+ } ) ;
122+ continue ;
123+ }
124+ }
125+
126+ return results ;
127+
128+ } catch ( error ) {
129+ throw error ;
130+ } finally {
131+ if ( page ) await page . close ( ) ;
132+ if ( browser ) await browser . close ( ) ;
133+ }
134+ }
135+
136+ // Handle worker initialization
137+ if ( parentPort ) {
138+ const config : WorkerConfig = workerData . config ;
139+ const sharedState : SharedState = workerData . sharedState ;
140+
141+ scrapeBatch ( config , sharedState )
142+ . then ( results => {
143+ parentPort ?. postMessage ( {
144+ type : 'complete' ,
145+ data : results
146+ } ) ;
147+ } )
148+ . catch ( error => {
149+ parentPort ?. postMessage ( {
150+ type : 'error' ,
151+ data : {
152+ workerId : config . workerIndex ,
153+ error : error . message
154+ }
155+ } ) ;
156+ } ) ;
157+ }