4
4
import re
5
5
from asyncio import Task
6
6
from functools import partial
7
- from typing import Any
7
+ from typing import Any , Union
8
8
9
- from pydantic import BaseModel , Field
9
+ from pydantic import BaseModel , Field , TypeAdapter
10
10
11
11
from crawlee import Request
12
12
from crawlee ._types import HttpMethod
13
13
from crawlee .http_clients import BaseHttpClient , HttpxHttpClient
14
- from crawlee .storages import RequestList
15
-
16
- from ._known_actor_input_keys import ActorInputKeys
14
+ from crawlee .storages import RequestList as CrawleeRequestList
17
15
18
16
URL_NO_COMMAS_REGEX = re .compile (
19
17
r'https?:\/\/(www\.)?([^\W_]|[^\W_][-\w0-9@:%._+~#=]{0,254}[^\W_])\.[a-z]{2,63}(:\d{1,5})?(\/[-\w@:%+.~#?&/=()]*)?'
@@ -24,50 +22,63 @@ class _RequestDetails(BaseModel):
24
22
method : HttpMethod = 'GET'
25
23
payload : str = ''
26
24
headers : dict [str , str ] = Field (default_factory = dict )
27
- user_data : dict [str , str ] = Field (default_factory = dict , alias = ActorInputKeys . startUrls . userData )
25
+ user_data : dict [str , str ] = Field (default_factory = dict , alias = ' userData' )
28
26
29
27
30
28
class _RequestsFromUrlInput(_RequestDetails):
    """Start-URL entry whose URLs are gathered from a remote resource.

    The Actor input supplies the remote location under the camelCase key
    ``requestsFromUrl``; it is exposed here as ``requests_from_url``.
    """

    # The alias has to equal the input key exactly -- any stray whitespace inside
    # the literal would make every such entry fail validation.
    requests_from_url: str = Field(alias='requestsFromUrl')
32
30
33
31
34
32
class _SimpleUrlInput(_RequestDetails):
    # Start-URL entry naming one URL that is included directly; the remaining
    # request details come from the _RequestDetails base model.
    url: str
36
34
37
35
38
- async def create_request_list (
39
- actor_start_urls_input : list [dict [str , Any ]], http_client : BaseHttpClient | None = None
36
# Validates the whole raw start-URL list in one call: every entry is turned into
# either a _RequestsFromUrlInput or a _SimpleUrlInput instance.
url_input_adapter = TypeAdapter(
    list[Union[_RequestsFromUrlInput, _SimpleUrlInput]],
)
37
+
38
+
39
class RequestList(CrawleeRequestList):
    """Crawlee ``RequestList`` that can be opened straight from Actor start-URL input."""

    @classmethod
    async def open(
        cls,
        name: str | None = None,
        actor_start_urls_input: list[dict[str, Any]] | None = None,
        http_client: BaseHttpClient | None = None,
    ) -> RequestList:
        """Build a request list out of the Actor input ``requestListSources``.

        Args:
            name: Name given to the returned request list.
            actor_start_urls_input: List of dicts, each carrying either a ``url``
                key or a ``requestsFromUrl`` key. A missing or empty value is
                handled as an empty list.
            http_client: Client used to send the GET request to every URL given
                under ``requestsFromUrl``.

        Returns:
            The request list assembled from the given input.

        Example ``actor_start_urls_input``::

            [
                # Gather urls from response body.
                {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
                # Directly include this url.
                {'url': 'https://crawlee.dev', 'method': 'GET'}
            ]
        """
        # Normalise "nothing supplied" (None or an empty list) to a fresh empty list.
        if not actor_start_urls_input:
            actor_start_urls_input = []
        request_list = await _create_request_list(name, actor_start_urls_input, http_client)
        return request_list
65
+
66
+
67
async def _create_request_list(
    name: str | None, actor_start_urls_input: list[dict[str, Any]], http_client: BaseHttpClient | None
) -> RequestList:
    """Validate the start-URL input and assemble a ``RequestList`` from it.

    Args:
        name: Name passed on to the created ``RequestList``.
        actor_start_urls_input: Raw start-URL entries; each one is validated as
            either a ``_SimpleUrlInput`` or a ``_RequestsFromUrlInput``.
        http_client: Client handed to ``_create_requests_from_url`` for the remote
            entries. When ``None``, a new ``HttpxHttpClient`` is created.

    Returns:
        A ``RequestList`` holding the requests from the direct-URL entries
        followed by the requests obtained for the remote entries.

    Note:
        Entries matching neither input model are rejected by
        ``url_input_adapter.validate_python`` (presumably with pydantic's
        ``ValidationError`` -- confirm against the pydantic version in use).
    """
    if http_client is None:
        http_client = HttpxHttpClient()

    # One validation pass yields a mixed list of the two input model types.
    url_inputs = url_input_adapter.validate_python(actor_start_urls_input)

    # Split by model type; the two models are siblings, so the isinstance checks
    # are mutually exclusive.
    simple_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _SimpleUrlInput)]
    remote_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _RequestsFromUrlInput)]

    simple_url_requests = _create_requests_from_input(simple_url_inputs)
    remote_url_requests = await _create_requests_from_url(remote_url_inputs, http_client=http_client)

    return RequestList(name=name, requests=simple_url_requests + remote_url_requests)
71
82
72
83
73
84
def _create_requests_from_input (simple_url_inputs : list [_SimpleUrlInput ]) -> list [Request ]:
0 commit comments