diff --git a/h11/_connection.py b/h11/_connection.py
index 410c4e9..c4baa80 100644
--- a/h11/_connection.py
+++ b/h11/_connection.py
@@ -425,7 +425,6 @@ def next_event(self):
             event = self._extract_next_receive_event()
             if event not in [NEED_DATA, PAUSED]:
                 self._process_event(self.their_role, event)
-            self._receive_buffer.compress()
             if event is NEED_DATA:
                 if len(self._receive_buffer) > self._max_incomplete_event_size:
                     # 431 is "Request header fields too large" which is pretty
diff --git a/h11/_readers.py b/h11/_readers.py
index cc86bff..bafc00a 100644
--- a/h11/_readers.py
+++ b/h11/_readers.py
@@ -153,7 +153,7 @@ def __call__(self, buf):
         assert self._bytes_to_discard == 0
         if self._bytes_in_chunk == 0:
             # We need to refill our chunk count
-            chunk_header = buf.maybe_extract_until_next(b"\r\n")
+            chunk_header = buf.maybe_extract_next_line()
             if chunk_header is None:
                 return None
             matches = validate(
diff --git a/h11/_receivebuffer.py b/h11/_receivebuffer.py
index c56749a..b32f20e 100644
--- a/h11/_receivebuffer.py
+++ b/h11/_receivebuffer.py
@@ -1,10 +1,12 @@
+import re
 import sys
 
 __all__ = ["ReceiveBuffer"]
 
 
 # Operations we want to support:
-# - find next \r\n or \r\n\r\n, or wait until there is one
+# - find next \r\n or \r\n\r\n (\n or \n\n are also acceptable),
+#   or wait until there is one
 # - read at-most-N bytes
 # Goals:
 # - on average, do this fast
@@ -38,75 +40,101 @@
 # slightly clever thing where we delay calling compress() until we've
 # processed a whole event, which could in theory be slightly more efficient
 # than the internal bytearray support.)
+
+blank_line_regex = re.compile(b"\n\r?\n", re.MULTILINE)
+
+
 class ReceiveBuffer(object):
     def __init__(self):
         self._data = bytearray()
-        # These are both absolute offsets into self._data:
-        self._start = 0
-        self._looked_at = 0
-        self._looked_for = b""
+        self._next_line_search = 0
+        self._multiple_lines_search = 0
+
+    def __iadd__(self, byteslike):
+        self._data += byteslike
+        return self
 
     def __bool__(self):
         return bool(len(self))
 
+    def __len__(self):
+        return len(self._data)
+
+    # for @property unprocessed_data
     def __bytes__(self):
-        return bytes(self._data[self._start :])
+        return bytes(self._data)
 
     if sys.version_info[0] < 3:  # version specific: Python 2
         __str__ = __bytes__
         __nonzero__ = __bool__
 
-    def __len__(self):
-        return len(self._data) - self._start
+    def _extract(self, count):
+        # Extract an initial slice of the data buffer and return it.
+        out = self._data[:count]
+        del self._data[:count]
 
-    def compress(self):
-        # Heuristic: only compress if it lets us reduce size by a factor
-        # of 2
-        if self._start > len(self._data) // 2:
-            del self._data[: self._start]
-            self._looked_at -= self._start
-            self._start -= self._start
+        self._next_line_search = 0
+        self._multiple_lines_search = 0
 
-    def __iadd__(self, byteslike):
-        self._data += byteslike
-        return self
+        return out
 
     def maybe_extract_at_most(self, count):
-        out = self._data[self._start : self._start + count]
+        """
+        Extract a fixed number of bytes from the buffer.
+ """ + out = self._data[:count] if not out: return None - self._start += len(out) - return out - def maybe_extract_until_next(self, needle): - # Returns extracted bytes on success (advancing offset), or None on - # failure - if self._looked_for == needle: - search_start = max(self._start, self._looked_at - len(needle) + 1) - else: - search_start = self._start - offset = self._data.find(needle, search_start) - if offset == -1: - self._looked_at = len(self._data) - self._looked_for = needle + return self._extract(count) + + def maybe_extract_next_line(self): + """ + Extract the first line, if it is completed in the buffer. + """ + # Only search in buffer space that we've not already looked at. + search_start_index = max(0, self._next_line_search - 1) + partial_idx = self._data.find(b"\r\n", search_start_index) + + if partial_idx == -1: + self._next_line_search = len(self._data) return None - new_start = offset + len(needle) - out = self._data[self._start : new_start] - self._start = new_start - return out - # HTTP/1.1 has a number of constructs where you keep reading lines until - # you see a blank one. This does that, and then returns the lines. + # + 2 is to compensate len(b"\r\n") + idx = partial_idx + 2 + + return self._extract(idx) + def maybe_extract_lines(self): - if self._data[self._start : self._start + 2] == b"\r\n": - self._start += 2 + """ + Extract everything up to the first blank line, and return a list of lines. + """ + # Handle the case where we have an immediate empty line. + if self._data[:1] == b"\n": + self._extract(1) + return [] + + if self._data[:2] == b"\r\n": + self._extract(2) return [] - else: - data = self.maybe_extract_until_next(b"\r\n\r\n") - if data is None: - return None - lines = data.split(b"\r\n") - assert lines[-2] == lines[-1] == b"" - del lines[-2:] - return lines + + # Only search in buffer space that we've not already looked at. + match = blank_line_regex.search(self._data, self._multiple_lines_search) + if match is None: + self._multiple_lines_search = max(0, len(self._data) - 2) + return None + + # Truncate the buffer and return it. 
+        idx = match.span(0)[-1]
+        out = self._extract(idx)
+        lines = out.split(b"\n")
+
+        for line in lines:
+            if line.endswith(b"\r"):
+                del line[-1]
+
+        assert lines[-2] == lines[-1] == b""
+
+        del lines[-2:]
+
+        return lines
diff --git a/h11/tests/test_io.py b/h11/tests/test_io.py
index 0323b26..459a627 100644
--- a/h11/tests/test_io.py
+++ b/h11/tests/test_io.py
@@ -215,6 +215,43 @@ def test_readers_unusual():
         ),
     )
 
+    # Tolerate header line endings (\r\n and \n)
+    # \n\r\n between headers and body
+    tr(
+        READERS[SERVER, SEND_RESPONSE],
+        b"HTTP/1.1 200 OK\r\nSomeHeader: val\n\r\n",
+        Response(
+            status_code=200,
+            headers=[("SomeHeader", "val")],
+            http_version="1.1",
+            reason="OK",
+        ),
+    )
+
+    # delimited only with \n
+    tr(
+        READERS[SERVER, SEND_RESPONSE],
+        b"HTTP/1.1 200 OK\nSomeHeader1: val1\nSomeHeader2: val2\n\n",
+        Response(
+            status_code=200,
+            headers=[("SomeHeader1", "val1"), ("SomeHeader2", "val2")],
+            http_version="1.1",
+            reason="OK",
+        ),
+    )
+
+    # mixed \r\n and \n
+    tr(
+        READERS[SERVER, SEND_RESPONSE],
+        b"HTTP/1.1 200 OK\r\nSomeHeader1: val1\nSomeHeader2: val2\n\r\n",
+        Response(
+            status_code=200,
+            headers=[("SomeHeader1", "val1"), ("SomeHeader2", "val2")],
+            http_version="1.1",
+            reason="OK",
+        ),
+    )
+
     # obsolete line folding
     tr(
         READERS[CLIENT, IDLE],
diff --git a/h11/tests/test_receivebuffer.py b/h11/tests/test_receivebuffer.py
index 2be220b..3a61f9d 100644
--- a/h11/tests/test_receivebuffer.py
+++ b/h11/tests/test_receivebuffer.py
@@ -1,3 +1,7 @@
+import re
+
+import pytest
+
 from .._receivebuffer import ReceiveBuffer
 
 
@@ -12,7 +16,6 @@ def test_receivebuffer():
     assert len(b) == 3
     assert bytes(b) == b"123"
 
-    b.compress()
     assert bytes(b) == b"123"
 
     assert b.maybe_extract_at_most(2) == b"12"
@@ -20,7 +23,6 @@
     assert len(b) == 1
     assert bytes(b) == b"3"
 
-    b.compress()
     assert bytes(b) == b"3"
 
     assert b.maybe_extract_at_most(10) == b"3"
@@ -33,32 +35,35 @@
     # maybe_extract_until_next
     ################################################################
 
-    b += b"12345a6789aa"
+    b += b"123\n456\r\n789\r\n"
 
-    assert b.maybe_extract_until_next(b"a") == b"12345a"
-    assert bytes(b) == b"6789aa"
+    assert b.maybe_extract_next_line() == b"123\n456\r\n"
+    assert bytes(b) == b"789\r\n"
 
-    assert b.maybe_extract_until_next(b"aaa") is None
-    assert bytes(b) == b"6789aa"
+    assert b.maybe_extract_next_line() == b"789\r\n"
+    assert bytes(b) == b""
 
-    b += b"a12"
-    assert b.maybe_extract_until_next(b"aaa") == b"6789aaa"
-    assert bytes(b) == b"12"
+    b += b"12\r"
+    assert b.maybe_extract_next_line() is None
+    assert bytes(b) == b"12\r"
 
-    # check repeated searches for the same needle, triggering the
-    # pickup-where-we-left-off logic
-    b += b"345"
-    assert b.maybe_extract_until_next(b"aaa") is None
+    b += b"345\n\r"
+    assert b.maybe_extract_next_line() is None
+    assert bytes(b) == b"12\r345\n\r"
 
-    b += b"6789aaa123"
-    assert b.maybe_extract_until_next(b"aaa") == b"123456789aaa"
-    assert bytes(b) == b"123"
+    # here we stopped in the middle of the b"\r\n" delimiter
+
+    b += b"\n6789aaa123\r\n"
+    assert b.maybe_extract_next_line() == b"12\r345\n\r\n"
+    assert b.maybe_extract_next_line() == b"6789aaa123\r\n"
+    assert b.maybe_extract_next_line() is None
+    assert bytes(b) == b""
 
     ################################################################
     # maybe_extract_lines
     ################################################################
 
-    b += b"\r\na: b\r\nfoo:bar\r\n\r\ntrailing"
+    b += b"123\r\na: b\r\nfoo:bar\r\n\r\ntrailing"
     lines = b.maybe_extract_lines()
     assert lines == [b"123", b"a: b", b"foo:bar"]
     assert bytes(b) == b"trailing"
@@ -76,3 +81,54 @@
     b += b"\r\ntrailing"
     assert b.maybe_extract_lines() == []
     assert bytes(b) == b"trailing"
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        pytest.param(
+            (
+                b"HTTP/1.1 200 OK\r\n",
+                b"Content-type: text/plain\r\n",
+                b"Connection: close\r\n",
+                b"\r\n",
+                b"Some body",
+            ),
+            id="with_crlf_delimiter",
+        ),
+        pytest.param(
+            (
+                b"HTTP/1.1 200 OK\n",
+                b"Content-type: text/plain\n",
+                b"Connection: close\n",
+                b"\n",
+                b"Some body",
+            ),
+            id="with_lf_only_delimiter",
+        ),
+        pytest.param(
+            (
+                b"HTTP/1.1 200 OK\n",
+                b"Content-type: text/plain\r\n",
+                b"Connection: close\n",
+                b"\n",
+                b"Some body",
+            ),
+            id="with_mixed_crlf_and_lf",
+        ),
+    ],
+)
+def test_receivebuffer_for_invalid_delimiter(data):
+    b = ReceiveBuffer()
+
+    for line in data:
+        b += line
+
+    lines = b.maybe_extract_lines()
+
+    assert lines == [
+        b"HTTP/1.1 200 OK",
+        b"Content-type: text/plain",
+        b"Connection: close",
+    ]
+    assert bytes(b) == b"Some body"
diff --git a/newsfragments/7.feature.rst b/newsfragments/7.feature.rst
new file mode 100644
index 0000000..01d6784
--- /dev/null
+++ b/newsfragments/7.feature.rst
@@ -0,0 +1,3 @@
+Added support for servers with broken line endings.
+
+After this change, h11 accepts both ``\r\n`` and ``\n`` as a header delimiter.