@@ -168,14 +168,17 @@ Type `Optional[str]`, default `None`
168
168
The endpoint of a remote Chromium browser to connect to using the
169
169
[ Chrome DevTools Protocol] ( https://chromedevtools.github.io/devtools-protocol/ ) ,
170
170
via [ ` BrowserType.connect_over_cdp ` ] ( https://playwright.dev/python/docs/api/class-browsertype#browser-type-connect-over-cdp ) .
171
+
172
+ ``` python
173
+ PLAYWRIGHT_CDP_URL = " http://localhost:9222"
174
+ ```
175
+
171
176
If this setting is used:
172
177
* all non-persistent contexts will be created on the connected remote browser
173
178
* the ` PLAYWRIGHT_LAUNCH_OPTIONS ` setting is ignored
174
179
* the ` PLAYWRIGHT_BROWSER_TYPE ` setting must not be set to a value different than "chromium"
175
180
176
- ``` python
177
- PLAYWRIGHT_CDP_URL = " http://localhost:9222"
178
- ```
181
+ ** This setting CANNOT be used at the same time as ` PLAYWRIGHT_CONNECT_URL ` **
179
182
180
183
### ` PLAYWRIGHT_CDP_KWARGS `
181
184
Type ` dict[str, Any] ` , default ` {} `
@@ -192,6 +195,41 @@ PLAYWRIGHT_CDP_KWARGS = {
192
195
}
193
196
```
194
197
198
+ ### ` PLAYWRIGHT_CONNECT_URL `
199
+ Type ` Optional[str] ` , default ` None `
200
+
201
+ URL of a remote Playwright browser instance to connect to using
202
+ [ ` BrowserType.connect ` ] ( https://playwright.dev/python/docs/api/class-browsertype#browser-type-connect ) .
203
+
204
+ From the upstream Playwright docs:
205
+ > When connecting to another browser launched via
206
+ > [ ` BrowserType.launchServer ` ] ( https://playwright.dev/docs/api/class-browsertype#browser-type-launch-server )
207
+ > in Node.js, the major and minor version needs to match the client version (1.2.3 → is compatible with 1.2.x).
208
+
209
+ ``` python
210
+ PLAYWRIGHT_CONNECT_URL = " ws://localhost:35477/ae1fa0bc325adcfd9600d9f712e9c733"
211
+ ```
212
+
213
+ If this setting is used:
214
+ * all non-persistent contexts will be created on the connected remote browser
215
+ * the ` PLAYWRIGHT_LAUNCH_OPTIONS ` setting is ignored
216
+
217
+ ** This setting CANNOT be used at the same time as ` PLAYWRIGHT_CDP_URL ` **
218
+
219
+ ### ` PLAYWRIGHT_CONNECT_KWARGS `
220
+ Type ` dict[str, Any] ` , default ` {} `
221
+
222
+ Additional keyword arguments to be passed to
223
+ [ ` BrowserType.connect ` ] ( https://playwright.dev/python/docs/api/class-browsertype#browser-type-connect )
224
+ when using ` PLAYWRIGHT_CONNECT_URL ` . The ` ws_endpoint ` key is always ignored,
225
+ ` PLAYWRIGHT_CONNECT_URL ` is used instead.
226
+
227
+ ``` python
228
+ PLAYWRIGHT_CONNECT_KWARGS = {
229
+ " slow_mo" : 1000 ,
230
+ " timeout" : 10 * 1000
231
+ }
232
+ ```
195
233
196
234
### ` PLAYWRIGHT_CONTEXTS `
197
235
Type ` dict[str, dict] ` , default ` {} `
@@ -286,6 +324,17 @@ def custom_headers(
286
324
PLAYWRIGHT_PROCESS_REQUEST_HEADERS = custom_headers
287
325
```
288
326
327
+ ### ` PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER `
328
+ Type ` bool ` , default ` True `
329
+
330
+ Whether the browser will be restarted if it gets disconnected, for instance if the local
331
+ browser crashes or a remote connection times out.
332
+ Implemented by listening to the
333
+ [ ` disconnected ` Browser event] ( https://playwright.dev/python/docs/api/class-browser#browser-event-disconnected ) ;
334
+ for this reason it does not apply to persistent contexts since
335
+ [ BrowserType.launch_persistent_context] ( https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-persistent-context )
336
+ returns the context directly.
337
+
289
338
### ` PLAYWRIGHT_MAX_PAGES_PER_CONTEXT `
290
339
Type ` int ` , defaults to the value of Scrapy's ` CONCURRENT_REQUESTS ` setting
291
340
@@ -459,14 +508,16 @@ This key could be used in conjunction with `playwright_include_page` to make a c
459
508
requests using the same page. For instance:
460
509
461
510
``` python
511
+ from playwright.async_api import Page
512
+
462
513
def start_requests (self ):
463
514
yield scrapy.Request(
464
515
url = " https://httpbin.org/get" ,
465
516
meta = {" playwright" : True , " playwright_include_page" : True },
466
517
)
467
518
468
519
def parse (self , response , ** kwargs ):
469
- page = response.meta[" playwright_page" ]
520
+ page: Page = response.meta[" playwright_page" ]
470
521
yield scrapy.Request(
471
522
url = " https://httpbin.org/headers" ,
472
523
callback = self .parse_headers,
@@ -507,6 +558,20 @@ def parse(self, response, **kwargs):
507
558
# {'issuer': 'DigiCert TLS RSA SHA256 2020 CA1', 'protocol': 'TLS 1.3', 'subjectName': 'www.example.org', 'validFrom': 1647216000, 'validTo': 1678838399}
508
559
```
509
560
561
+ ### ` playwright_suggested_filename `
562
+ Type ` Optional[str] ` , read only
563
+
564
+ The value of the [ ` Download.suggested_filename ` ] ( https://playwright.dev/python/docs/api/class-download#download-suggested-filename )
565
+ attribute when the response is the binary contents of a
566
+ [ download] ( https://playwright.dev/python/docs/downloads ) (e.g. a PDF file).
567
+ Only available for responses that only caused a download. Can be accessed
568
+ in the callback via ` response.meta['playwright_suggested_filename'] ` .
569
+
570
+ ``` python
571
+ def parse (self , response , ** kwargs ):
572
+ print (response.meta[" playwright_suggested_filename" ])
573
+ # 'sample_file.pdf'
574
+ ```
510
575
511
576
## Receiving Page objects in callbacks
512
577
@@ -525,6 +590,7 @@ necessary the spider job could get stuck because of the limit set by the
525
590
` PLAYWRIGHT_MAX_PAGES_PER_CONTEXT ` setting.
526
591
527
592
``` python
593
+ from playwright.async_api import Page
528
594
import scrapy
529
595
530
596
class AwesomeSpiderWithPage (scrapy .Spider ):
@@ -539,7 +605,7 @@ class AwesomeSpiderWithPage(scrapy.Spider):
539
605
)
540
606
541
607
def parse_first (self , response ):
542
- page = response.meta[" playwright_page" ]
608
+ page: Page = response.meta[" playwright_page" ]
543
609
return scrapy.Request(
544
610
url = " https://example.com" ,
545
611
callback = self .parse_second,
@@ -548,13 +614,13 @@ class AwesomeSpiderWithPage(scrapy.Spider):
548
614
)
549
615
550
616
async def parse_second (self , response ):
551
- page = response.meta[" playwright_page" ]
617
+ page: Page = response.meta[" playwright_page" ]
552
618
title = await page.title() # "Example Domain"
553
619
await page.close()
554
620
return {" title" : title}
555
621
556
622
async def errback_close_page (self , failure ):
557
- page = failure.request.meta[" playwright_page" ]
623
+ page: Page = failure.request.meta[" playwright_page" ]
558
624
await page.close()
559
625
```
560
626
0 commit comments