3
3
"""
4
4
from __future__ import absolute_import
5
5
import json
6
+ import six
6
7
from base64 import b64decode , b64encode
7
8
from frontera .core .codec import BaseDecoder , BaseEncoder
8
- from w3lib .util import to_unicode , to_native_str
9
- from frontera .utils .misc import dict_to_unicode , dict_to_bytes
9
+ from w3lib .util import to_unicode , to_bytes
10
+
11
+
12
+ def _convert_and_save_type (obj ):
13
+ """
14
+ :param obj: dict object
15
+
16
+ The purpose of this method is to transform the given dict
17
+ into a form that would be able to serialize with JSONEncoder.
18
+ In order to implement this, this method converts all byte strings
19
+ inside a dict to unicode and saves their type for reverting to its
20
+ original state. The type and the value are stored as a tuple in the
21
+ following format: (original_type, converted value). All other objects
22
+ like dict, tuple, list are converted to the same format for the sake
23
+ of serialization and for the ease of reverting.
24
+ Refer `https://github.com/scrapinghub/frontera/pull/233#discussion_r97432868`
25
+ for the detailed explanation about the design.
26
+ """
27
+ if isinstance (obj , bytes ):
28
+ return 'bytes' , to_unicode (obj )
29
+ elif isinstance (obj , dict ):
30
+ return 'dict' , [(_convert_and_save_type (k ), _convert_and_save_type (v )) for k , v in six .iteritems (obj )]
31
+ elif isinstance (obj , (list , tuple )):
32
+ return type (obj ).__name__ , [_convert_and_save_type (item ) for item in obj ]
33
+ return 'other' , obj
34
+
35
+
36
+ def _convert_from_saved_type (obj ):
37
+ """
38
+ :param obj: object returned by `_convert_and_save_type`
39
+
40
+ Restores the original state of the object converted
41
+ earlier by `_convert_and_save_type`. This method considers every
42
+ first element of the nested tuple as the original type information and
43
+ the second value to be the converted value. It applies the original type
44
+ recursively on the object to retrieve the original form of the object.
45
+ """
46
+ assert len (obj ) == 2
47
+ obj_type , obj_value = obj
48
+ if obj_type == 'bytes' :
49
+ return to_bytes (obj_value )
50
+ elif obj_type == 'dict' :
51
+ return dict ([(_convert_from_saved_type (k ), _convert_from_saved_type (v )) for k , v in obj_value ])
52
+ elif obj_type in ['list' , 'tuple' ]:
53
+ _type = list if obj_type == 'list' else tuple
54
+ return _type ([_convert_from_saved_type (item ) for item in obj_value ])
55
+ return obj_value
10
56
11
57
12
58
def _prepare_request_message (request ):
13
- return {'url' : to_unicode ( request .url ) ,
14
- 'method' : to_unicode ( request .method ) ,
15
- 'headers' : dict_to_unicode ( request .headers ) ,
16
- 'cookies' : dict_to_unicode ( request .cookies ) ,
17
- 'meta' : dict_to_unicode ( request .meta ) }
59
+ return {'url' : request .url ,
60
+ 'method' : request .method ,
61
+ 'headers' : request .headers ,
62
+ 'cookies' : request .cookies ,
63
+ 'meta' : request .meta }
18
64
19
65
20
66
def _prepare_links_message (links ):
21
67
return [_prepare_request_message (link ) for link in links ]
22
68
23
69
24
70
def _prepare_response_message (response , send_body ):
25
- return {'url' : to_unicode ( response .url ) ,
71
+ return {'url' : response .url ,
26
72
'status_code' : response .status_code ,
27
- 'meta' : dict_to_unicode ( response .meta ) ,
28
- 'body' : to_unicode ( b64encode (response .body ) ) if send_body else None }
73
+ 'meta' : response .meta ,
74
+ 'body' : b64encode (response .body ) if send_body else None }
29
75
30
76
31
77
class CrawlFrontierJSONEncoder (json .JSONEncoder ):
@@ -45,6 +91,10 @@ def __init__(self, request_model, *a, **kw):
45
91
self .send_body = kw .pop ('send_body' , False )
46
92
super (Encoder , self ).__init__ (request_model , * a , ** kw )
47
93
94
+ def encode (self , obj ):
95
+ encoded = _convert_and_save_type (obj )
96
+ return super (Encoder , self ).encode (encoded )
97
+
48
98
def encode_add_seeds (self , seeds ):
49
99
return self .encode ({
50
100
'type' : 'add_seeds' ,
@@ -101,52 +151,51 @@ def __init__(self, request_model, response_model, *a, **kw):
101
151
super (Decoder , self ).__init__ (* a , ** kw )
102
152
103
153
def _response_from_object (self , obj ):
104
- url = to_native_str ( obj [b 'url' ])
154
+ url = obj ['url' ]
105
155
request = self ._request_model (url = url ,
106
- meta = obj [b 'meta' ])
156
+ meta = obj ['meta' ])
107
157
return self ._response_model (url = url ,
108
- status_code = obj [b 'status_code' ],
109
- body = b64decode (obj [b 'body' ]) if obj [b 'body' ] is not None else None ,
158
+ status_code = obj ['status_code' ],
159
+ body = b64decode (obj ['body' ]) if obj ['body' ] is not None else None ,
110
160
request = request )
111
161
112
162
def _request_from_object (self , obj ):
113
- return self ._request_model (url = to_native_str ( obj [b 'url' ]) ,
114
- method = obj [b 'method' ],
115
- headers = obj [b 'headers' ],
116
- cookies = obj [b 'cookies' ],
117
- meta = obj [b 'meta' ])
163
+ return self ._request_model (url = obj ['url' ],
164
+ method = obj ['method' ],
165
+ headers = obj ['headers' ],
166
+ cookies = obj ['cookies' ],
167
+ meta = obj ['meta' ])
118
168
119
169
def decode (self , message ):
120
- message = dict_to_bytes (super (Decoder , self ).decode (message ))
121
- if message [b 'type' ] == b 'links_extracted' :
122
- request = self ._request_from_object (message [b 'r' ])
123
- links = [self ._request_from_object (link ) for link in message [b 'links' ]]
170
+ message = _convert_from_saved_type (super (Decoder , self ).decode (message ))
171
+ if message ['type' ] == 'links_extracted' :
172
+ request = self ._request_from_object (message ['r' ])
173
+ links = [self ._request_from_object (link ) for link in message ['links' ]]
124
174
return ('links_extracted' , request , links )
125
- if message [b 'type' ] == b 'page_crawled' :
126
- response = self ._response_from_object (message [b 'r' ])
175
+ if message ['type' ] == 'page_crawled' :
176
+ response = self ._response_from_object (message ['r' ])
127
177
return ('page_crawled' , response )
128
- if message [b 'type' ] == b 'request_error' :
129
- request = self ._request_from_object (message [b 'r' ])
130
- return ('request_error' , request , to_native_str ( message [b 'error' ]) )
131
- if message [b 'type' ] == b 'update_score' :
132
- return ('update_score' , self ._request_from_object (message [b 'r' ]), message [b 'score' ], message [b 'schedule' ])
133
- if message [b 'type' ] == b 'add_seeds' :
178
+ if message ['type' ] == 'request_error' :
179
+ request = self ._request_from_object (message ['r' ])
180
+ return ('request_error' , request , message ['error' ])
181
+ if message ['type' ] == 'update_score' :
182
+ return ('update_score' , self ._request_from_object (message ['r' ]), message ['score' ], message ['schedule' ])
183
+ if message ['type' ] == 'add_seeds' :
134
184
seeds = []
135
- for seed in message [b 'seeds' ]:
185
+ for seed in message ['seeds' ]:
136
186
request = self ._request_from_object (seed )
137
187
seeds .append (request )
138
188
return ('add_seeds' , seeds )
139
- if message [b 'type' ] == b 'new_job_id' :
140
- return ('new_job_id' , int (message [b 'job_id' ]))
141
- if message [b 'type' ] == b 'offset' :
142
- return ('offset' , int (message [b 'partition_id' ]), int (message [b 'offset' ]))
189
+ if message ['type' ] == 'new_job_id' :
190
+ return ('new_job_id' , int (message ['job_id' ]))
191
+ if message ['type' ] == 'offset' :
192
+ return ('offset' , int (message ['partition_id' ]), int (message ['offset' ]))
143
193
return TypeError ('Unknown message type' )
144
194
145
195
def decode_request (self , message ):
146
- obj = dict_to_bytes (super (Decoder , self ).decode (message ))
147
- return self ._request_model (url = to_native_str (obj [b'url' ]),
148
- method = obj [b'method' ],
149
- headers = obj [b'headers' ],
150
- cookies = obj [b'cookies' ],
151
- meta = obj [b'meta' ])
152
-
196
+ obj = _convert_from_saved_type (super (Decoder , self ).decode (message ))
197
+ return self ._request_model (url = obj ['url' ],
198
+ method = obj ['method' ],
199
+ headers = obj ['headers' ],
200
+ cookies = obj ['cookies' ],
201
+ meta = obj ['meta' ])
0 commit comments