70
70
# To be used with str.strip() and related methods.
71
71
HTTP_WHITESPACE = " \t "
72
72
73
+ # Roughly the inverse of RequestHandler._VALID_HEADER_CHARS, but permits
74
+ # chars greater than \xFF (which may appear after decoding utf8).
75
+ _FORBIDDEN_HEADER_CHARS_RE = re .compile (r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]" )
76
+
73
77
74
78
class _ABNF :
75
79
"""Class that holds a subset of ABNF rules from RFC 9110 and friends.
@@ -196,14 +200,18 @@ def __init__(self, *args: typing.Any, **kwargs: str) -> None: # noqa: F811
196
200
197
201
# new public methods
198
202
199
- def add (self , name : str , value : str ) -> None :
203
+ def add (self , name : str , value : str , * , _chars_are_bytes : bool = True ) -> None :
200
204
"""Adds a new value for the given key."""
201
205
if not _ABNF .field_name .fullmatch (name ):
202
206
raise HTTPInputError ("Invalid header name %r" % name )
203
- if not _ABNF .field_value .fullmatch (to_unicode (value )):
204
- # TODO: the fact we still support bytes here (contrary to type annotations)
205
- # and still test for it should probably be changed.
206
- raise HTTPInputError ("Invalid header value %r" % value )
207
+ if _chars_are_bytes :
208
+ if not _ABNF .field_value .fullmatch (to_unicode (value )):
209
+ # TODO: the fact we still support bytes here (contrary to type annotations)
210
+ # and still test for it should probably be changed.
211
+ raise HTTPInputError ("Invalid header value %r" % value )
212
+ else :
213
+ if _FORBIDDEN_HEADER_CHARS_RE .search (value ):
214
+ raise HTTPInputError ("Invalid header value %r" % value )
207
215
norm_name = _normalize_header (name )
208
216
self ._last_key = norm_name
209
217
if norm_name in self :
@@ -229,7 +237,7 @@ def get_all(self) -> Iterable[Tuple[str, str]]:
229
237
for value in values :
230
238
yield (name , value )
231
239
232
- def parse_line (self , line : str ) -> None :
240
+ def parse_line (self , line : str , * , _chars_are_bytes : bool = True ) -> None :
233
241
r"""Updates the dictionary with a single header line.
234
242
235
243
>>> h = HTTPHeaders()
@@ -263,19 +271,25 @@ def parse_line(self, line: str) -> None:
263
271
if self ._last_key is None :
264
272
raise HTTPInputError ("first header line cannot start with whitespace" )
265
273
new_part = " " + line .strip (HTTP_WHITESPACE )
266
- if not _ABNF .field_value .fullmatch (new_part [1 :]):
267
- raise HTTPInputError ("Invalid header continuation %r" % new_part )
274
+ if _chars_are_bytes :
275
+ if not _ABNF .field_value .fullmatch (new_part [1 :]):
276
+ raise HTTPInputError ("Invalid header continuation %r" % new_part )
277
+ else :
278
+ if _FORBIDDEN_HEADER_CHARS_RE .search (new_part ):
279
+ raise HTTPInputError ("Invalid header value %r" % new_part )
268
280
self ._as_list [self ._last_key ][- 1 ] += new_part
269
281
self ._dict [self ._last_key ] += new_part
270
282
else :
271
283
try :
272
284
name , value = line .split (":" , 1 )
273
285
except ValueError :
274
286
raise HTTPInputError ("no colon in header line" )
275
- self .add (name , value .strip (HTTP_WHITESPACE ))
287
+ self .add (
288
+ name , value .strip (HTTP_WHITESPACE ), _chars_are_bytes = _chars_are_bytes
289
+ )
276
290
277
291
@classmethod
278
- def parse (cls , headers : str ) -> "HTTPHeaders" :
292
+ def parse (cls , headers : str , * , _chars_are_bytes : bool = True ) -> "HTTPHeaders" :
279
293
"""Returns a dictionary from HTTP header text.
280
294
281
295
>>> h = HTTPHeaders.parse("Content-Type: text/html\\r\\nContent-Length: 42\\r\\n")
@@ -288,17 +302,31 @@ def parse(cls, headers: str) -> "HTTPHeaders":
288
302
mix of `KeyError`, and `ValueError`.
289
303
290
304
"""
305
+ # _chars_are_bytes is a hack. This method is used in two places, HTTP headers (in which
306
+ # non-ascii characters are to be interpreted as latin-1) and multipart/form-data (in which
307
+ # they are to be interpreted as utf-8). For historical reasons, this method handled this by
308
+ # expecting both callers to decode the headers to strings before parsing them. This wasn't a
309
+ # problem until we started doing stricter validation of the characters allowed in HTTP
310
+ # headers (using ABNF rules defined in terms of byte values), which inadvertently started
311
+ # disallowing non-latin1 characters in multipart/form-data filenames.
312
+ #
313
+ # This method should have accepted bytes and a desired encoding, but this change is being
314
+ # introduced in a patch release that shouldn't change the API. Instead, the _chars_are_bytes
315
+ # flag decides whether to use HTTP-style ABNF validation (treating the string as bytes
316
+ # smuggled through the latin1 encoding) or to accept any non-control unicode characters
317
+ # as required by multipart/form-data. This method will change to accept bytes in a future
318
+ # release.
291
319
h = cls ()
292
320
293
321
start = 0
294
322
while True :
295
323
lf = headers .find ("\n " , start )
296
324
if lf == - 1 :
297
- h .parse_line (headers [start :])
325
+ h .parse_line (headers [start :], _chars_are_bytes = _chars_are_bytes )
298
326
break
299
327
line = headers [start : lf + 1 ]
300
328
start = lf + 1
301
- h .parse_line (line )
329
+ h .parse_line (line , _chars_are_bytes = _chars_are_bytes )
302
330
return h
303
331
304
332
# MutableMapping abstract method implementations.
@@ -946,7 +974,7 @@ def parse_multipart_form_data(
946
974
eoh = part .find (b"\r \n \r \n " )
947
975
if eoh == - 1 :
948
976
raise HTTPInputError ("multipart/form-data missing headers" )
949
- headers = HTTPHeaders .parse (part [:eoh ].decode ("utf-8" ))
977
+ headers = HTTPHeaders .parse (part [:eoh ].decode ("utf-8" ), _chars_are_bytes = False )
950
978
disp_header = headers .get ("Content-Disposition" , "" )
951
979
disposition , disp_params = _parse_header (disp_header )
952
980
if disposition != "form-data" or not part .endswith (b"\r \n " ):
0 commit comments