1
- # -*- coding: utf-8 -*-
1
+ # -*- coding: utf-8 -*-
2
+ from frontera .core .models import Request
3
+ from frontera .contrib .middlewares .fingerprint import UrlFingerprintMiddleware
4
+
5
+ from abc import ABCMeta , abstractmethod
6
+
7
+
8
class BaseCrawlingStrategy(object):
    """
    Interface definition for a crawling strategy.

    Before calling these methods the strategy worker adds a 'state' key to the meta field of every
    :class:`Request <frontera.core.models.Request>` with the state of the URL. Please refer to the
    HBaseBackend implementation for the possible states.

    After exiting from all of these methods the states from the meta field are passed back and stored
    in the backend.
    """
    # Python 2 style metaclass declaration; Python 3 ignores this attribute, so abstractness is
    # only enforced when running under Python 2.
    __metaclass__ = ABCMeta

    def __init__(self, manager, mb_stream, states_context):
        """
        Store the collaborators used by the helper methods of this class.

        :param manager: object handed to :class:`UrlFingerprintMiddleware` on construction
        :param mb_stream: stream object; must provide ``send()`` and ``flush()``
        :param states_context: states context; must provide ``refresh_and_keep()`` and ``release()``
        """
        self._mb_stream = mb_stream
        self._states_context = states_context
        self.url_mw = UrlFingerprintMiddleware(manager)

    @classmethod
    def from_worker(cls, manager, mb_stream, states_context):
        """
        Called on instantiation in strategy worker.

        :param manager: :class:`FrontierManager <frontera.core.manager.FrontierManager>` instance
        :param mb_stream: :class:`UpdateScoreStream <frontera.worker.strategy.UpdateScoreStream>` instance
        :param states_context: states context object, passed through to the constructor unchanged
        :return: new instance
        """
        # Alternate constructor: the new instance must be returned, not raised.
        return cls(manager, mb_stream, states_context)

    @abstractmethod
    def add_seeds(self, seeds):
        """
        Called when add_seeds event is received from spider log.

        :param list seeds: A list of :class:`Request <frontera.core.models.Request>` objects.
        """
        return {}

    @abstractmethod
    def page_crawled(self, response, links):
        """
        Called every time a document was successfully crawled and a page_crawled event is received
        from spider log.

        :param object response: The :class:`Response <frontera.core.models.Response>` object for the crawled page.
        :param list links: A list of :class:`Request <frontera.core.models.Request>` objects generated from
                           the links extracted for the crawled page.
        """
        return {}

    @abstractmethod
    def page_error(self, request, error):
        """
        Called every time there was an error during page downloading.

        :param object request: The :class:`Request <frontera.core.models.Request>` object fetched with error.
        :param str error: A string identifier for the error.
        """
        return {}

    def finished(self):
        """
        Called by Strategy worker, after finishing processing each cycle of spider log. If this method
        returns true, then Strategy worker reports that crawling goal is achieved, stops and exits.

        :return: bool
        """
        return False

    def close(self):
        """
        Called when strategy worker is about to close crawling strategy.

        Flushes the score stream first, then releases the states context.
        """
        self._mb_stream.flush()
        self._states_context.release()

    def schedule(self, request, score=1.0, dont_queue=False):
        """
        Schedule document for crawling with specified score.

        :param request: A :class:`Request <frontera.core.models.Request>` object; its ``meta`` must
                        already contain a ``'fingerprint'`` key.
        :param score: float from 0.0 to 1.0
        :param dont_queue: bool, True - if no need to schedule, only update the score
        """
        self._mb_stream.send(request.url, request.meta['fingerprint'], score, dont_queue)

    def create_request(self, url, method='GET', headers=None, cookies=None, meta=None, body=''):
        """
        Creates request with specified fields, with state fetched from backend.

        The new request gets its fingerprint from the URL fingerprint middleware and is then passed
        to the states context via ``refresh_and_keep()``.

        :param url: str
        :param method: str
        :param headers: dict
        :param cookies: dict
        :param meta: dict
        :param body: str
        :return: :class:`Request <frontera.core.models.Request>`
        """
        r = Request(url, method=method, headers=headers, cookies=cookies, meta=meta, body=body)
        self.url_mw._add_fingerprint(r)
        self._states_context.refresh_and_keep(r)
        return r
0 commit comments