Skip to content

Commit b5586f3

Browse files
authored
Merge pull request #3503 from bdarnell/multipart-utf8
httputil: Fix support for non-latin1 filenames in multipart uploads
2 parents ab5f354 + 62c2764 commit b5586f3

File tree

5 files changed

+73
-16
lines changed

5 files changed

+73
-16
lines changed

docs/releases.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ Release notes
44
.. toctree::
55
:maxdepth: 2
66

7+
releases/v6.5.1
78
releases/v6.5.0
89
releases/v6.4.2
910
releases/v6.4.1

docs/releases/v6.5.1.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
What's new in Tornado 6.5.1
2+
===========================
3+
4+
May 22, 2025
5+
------------
6+
7+
Bug fixes
8+
~~~~~~~~~
9+
10+
- Fixed a bug in ``multipart/form-data`` parsing that could incorrectly reject filenames containing
11+
characters above U+00FF (i.e. most characters outside the Latin alphabet).

tornado/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@
2222
# is zero for an official release, positive for a development branch,
2323
# or negative for a release candidate or beta (after the base version
2424
# number has been incremented)
25-
version = "6.5"
26-
version_info = (6, 5, 0, 0)
25+
version = "6.5.1"
26+
version_info = (6, 5, 1, 0)
2727

2828
import importlib
2929
import typing

tornado/httputil.py

Lines changed: 41 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@
7070
# To be used with str.strip() and related methods.
7171
HTTP_WHITESPACE = " \t"
7272

73+
# Roughly the inverse of RequestHandler._VALID_HEADER_CHARS, but permits
74+
# chars greater than \xFF (which may appear after decoding utf8).
75+
_FORBIDDEN_HEADER_CHARS_RE = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")
76+
7377

7478
class _ABNF:
7579
"""Class that holds a subset of ABNF rules from RFC 9110 and friends.
@@ -196,14 +200,18 @@ def __init__(self, *args: typing.Any, **kwargs: str) -> None: # noqa: F811
196200

197201
# new public methods
198202

199-
def add(self, name: str, value: str) -> None:
203+
def add(self, name: str, value: str, *, _chars_are_bytes: bool = True) -> None:
200204
"""Adds a new value for the given key."""
201205
if not _ABNF.field_name.fullmatch(name):
202206
raise HTTPInputError("Invalid header name %r" % name)
203-
if not _ABNF.field_value.fullmatch(to_unicode(value)):
204-
# TODO: the fact we still support bytes here (contrary to type annotations)
205-
# and still test for it should probably be changed.
206-
raise HTTPInputError("Invalid header value %r" % value)
207+
if _chars_are_bytes:
208+
if not _ABNF.field_value.fullmatch(to_unicode(value)):
209+
# TODO: the fact we still support bytes here (contrary to type annotations)
210+
# and still test for it should probably be changed.
211+
raise HTTPInputError("Invalid header value %r" % value)
212+
else:
213+
if _FORBIDDEN_HEADER_CHARS_RE.search(value):
214+
raise HTTPInputError("Invalid header value %r" % value)
207215
norm_name = _normalize_header(name)
208216
self._last_key = norm_name
209217
if norm_name in self:
@@ -229,7 +237,7 @@ def get_all(self) -> Iterable[Tuple[str, str]]:
229237
for value in values:
230238
yield (name, value)
231239

232-
def parse_line(self, line: str) -> None:
240+
def parse_line(self, line: str, *, _chars_are_bytes: bool = True) -> None:
233241
r"""Updates the dictionary with a single header line.
234242
235243
>>> h = HTTPHeaders()
@@ -263,19 +271,25 @@ def parse_line(self, line: str) -> None:
263271
if self._last_key is None:
264272
raise HTTPInputError("first header line cannot start with whitespace")
265273
new_part = " " + line.strip(HTTP_WHITESPACE)
266-
if not _ABNF.field_value.fullmatch(new_part[1:]):
267-
raise HTTPInputError("Invalid header continuation %r" % new_part)
274+
if _chars_are_bytes:
275+
if not _ABNF.field_value.fullmatch(new_part[1:]):
276+
raise HTTPInputError("Invalid header continuation %r" % new_part)
277+
else:
278+
if _FORBIDDEN_HEADER_CHARS_RE.search(new_part):
279+
raise HTTPInputError("Invalid header value %r" % new_part)
268280
self._as_list[self._last_key][-1] += new_part
269281
self._dict[self._last_key] += new_part
270282
else:
271283
try:
272284
name, value = line.split(":", 1)
273285
except ValueError:
274286
raise HTTPInputError("no colon in header line")
275-
self.add(name, value.strip(HTTP_WHITESPACE))
287+
self.add(
288+
name, value.strip(HTTP_WHITESPACE), _chars_are_bytes=_chars_are_bytes
289+
)
276290

277291
@classmethod
278-
def parse(cls, headers: str) -> "HTTPHeaders":
292+
def parse(cls, headers: str, *, _chars_are_bytes: bool = True) -> "HTTPHeaders":
279293
"""Returns a dictionary from HTTP header text.
280294
281295
>>> h = HTTPHeaders.parse("Content-Type: text/html\\r\\nContent-Length: 42\\r\\n")
@@ -288,17 +302,31 @@ def parse(cls, headers: str) -> "HTTPHeaders":
288302
mix of `KeyError`, and `ValueError`.
289303
290304
"""
305+
# _chars_are_bytes is a hack. This method is used in two places, HTTP headers (in which
306+
# non-ascii characters are to be interpreted as latin-1) and multipart/form-data (in which
307+
# they are to be interpreted as utf-8). For historical reasons, this method handled this by
308+
# expecting both callers to decode the headers to strings before parsing them. This wasn't a
309+
# problem until we started doing stricter validation of the characters allowed in HTTP
310+
# headers (using ABNF rules defined in terms of byte values), which inadvertently started
311+
# disallowing non-latin1 characters in multipart/form-data filenames.
312+
#
313+
# This method should have accepted bytes and a desired encoding, but this change is being
314+
# introduced in a patch release that shouldn't change the API. Instead, the _chars_are_bytes
315+
# flag decides whether to use HTTP-style ABNF validation (treating the string as bytes
316+
# smuggled through the latin1 encoding) or to accept any non-control unicode characters
317+
# as required by multipart/form-data. This method will change to accept bytes in a future
318+
# release.
291319
h = cls()
292320

293321
start = 0
294322
while True:
295323
lf = headers.find("\n", start)
296324
if lf == -1:
297-
h.parse_line(headers[start:])
325+
h.parse_line(headers[start:], _chars_are_bytes=_chars_are_bytes)
298326
break
299327
line = headers[start : lf + 1]
300328
start = lf + 1
301-
h.parse_line(line)
329+
h.parse_line(line, _chars_are_bytes=_chars_are_bytes)
302330
return h
303331

304332
# MutableMapping abstract method implementations.
@@ -946,7 +974,7 @@ def parse_multipart_form_data(
946974
eoh = part.find(b"\r\n\r\n")
947975
if eoh == -1:
948976
raise HTTPInputError("multipart/form-data missing headers")
949-
headers = HTTPHeaders.parse(part[:eoh].decode("utf-8"))
977+
headers = HTTPHeaders.parse(part[:eoh].decode("utf-8"), _chars_are_bytes=False)
950978
disp_header = headers.get("Content-Disposition", "")
951979
disposition, disp_params = _parse_header(disp_header)
952980
if disposition != "form-data" or not part.endswith(b"\r\n"):

tornado/test/httputil_test.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ def test_special_filenames(self):
155155
self.assertEqual(file["filename"], filename)
156156
self.assertEqual(file["body"], b"Foo")
157157

158-
def test_non_ascii_filename(self):
158+
def test_non_ascii_filename_rfc5987(self):
159159
data = b"""\
160160
--1234
161161
Content-Disposition: form-data; name="files"; filename="ab.txt"; filename*=UTF-8''%C3%A1b.txt
@@ -170,6 +170,23 @@ def test_non_ascii_filename(self):
170170
self.assertEqual(file["filename"], "áb.txt")
171171
self.assertEqual(file["body"], b"Foo")
172172

173+
def test_non_ascii_filename_raw(self):
174+
data = """\
175+
--1234
176+
Content-Disposition: form-data; name="files"; filename="测试.txt"
177+
178+
Foo
179+
--1234--""".encode(
180+
"utf-8"
181+
).replace(
182+
b"\n", b"\r\n"
183+
)
184+
args, files = form_data_args()
185+
parse_multipart_form_data(b"1234", data, args, files)
186+
file = files["files"][0]
187+
self.assertEqual(file["filename"], "测试.txt")
188+
self.assertEqual(file["body"], b"Foo")
189+
173190
def test_boundary_starts_and_ends_with_quotes(self):
174191
data = b"""\
175192
--1234

0 commit comments

Comments
 (0)