Merge pull request #3387 from bdarnell/chunked-parsing

http1connection: Stricter handling of transfer-encoding and whitespace
tornadoweb · Jun 6, 2024 · d65f6e7 · d65f6e7
2 parents 7786f09 + 8d721a8
commit d65f6e7
Show file tree

Hide file tree

Showing 5 changed files with 133 additions and 22 deletions.
diff --git a/tornado/http1connection.py b/tornado/http1connection.py
@@ -391,14 +391,11 @@ def write_headers(
  self._request_start_line = start_line
  lines.append(utf8("%s %s HTTP/1.1" % (start_line[0], start_line[1])))
  # Client requests with a non-empty body must have either a
- # Content-Length or a Transfer-Encoding.
+ # Content-Length or a Transfer-Encoding. If Content-Length is not
+ # present we'll add our Transfer-Encoding below.
  self._chunking_output = (
  start_line.method in ("POST", "PUT", "PATCH")
  and "Content-Length" not in headers
- and (
- "Transfer-Encoding" not in headers
- or headers["Transfer-Encoding"] == "chunked"
- )
  )
  else:
  assert isinstance(start_line, httputil.ResponseStartLine)
@@ -420,9 +417,6 @@ def write_headers(
  and (start_line.code < 100 or start_line.code >= 200)
  # No need to chunk the output if a Content-Length is specified.
  and "Content-Length" not in headers
- # Applications are discouraged from touching Transfer-Encoding,
- # but if they do, leave it alone.
- and "Transfer-Encoding" not in headers
  )
  # If connection to a 1.1 client will be closed, inform client
  if (
@@ -562,7 +556,7 @@ def _can_keep_alive(
  return connection_header != "close"
  elif (
  "Content-Length" in headers
- or headers.get("Transfer-Encoding", "").lower() == "chunked"
+ or is_transfer_encoding_chunked(headers)
  or getattr(start_line, "method", None) in ("HEAD", "GET")
  ):
  # start_line may be a request or response start line; only
@@ -600,13 +594,6 @@ def _read_body(
  delegate: httputil.HTTPMessageDelegate,
  ) -> Optional[Awaitable[None]]:
  if "Content-Length" in headers:
- if "Transfer-Encoding" in headers:
- # Response cannot contain both Content-Length and
- # Transfer-Encoding headers.
- # http:https://tools.ietf.org/html/rfc7230#section-3.3.3
- raise httputil.HTTPInputError(
- "Response with both Transfer-Encoding and Content-Length"
- )
  if "," in headers["Content-Length"]:
  # Proxies sometimes cause Content-Length headers to get
  # duplicated. If all the values are identical then we can
@@ -633,20 +620,22 @@ def _read_body(
  else:
  content_length = None
 
+ is_chunked = is_transfer_encoding_chunked(headers)
+
  if code == 204:
  # This response code is not allowed to have a non-empty body,
  # and has an implicit length of zero instead of read-until-close.
  # http:https://www.w3.org/Protocols/rfc2616/rfc2616-sec4.html#sec4.3
- if "Transfer-Encoding" in headers or content_length not in (None, 0):
+ if is_chunked or content_length not in (None, 0):
  raise httputil.HTTPInputError(
  "Response with code %d should not have body" % code
  )
  content_length = 0
 
+ if is_chunked:
+ return self._read_chunked_body(delegate)
  if content_length is not None:
  return self._read_fixed_body(content_length, delegate)
- if headers.get("Transfer-Encoding", "").lower() == "chunked":
- return self._read_chunked_body(delegate)
  if self.is_client:
  return self._read_body_until_close(delegate)
  return None
@@ -865,3 +854,33 @@ def parse_hex_int(s: str) -> int:
  if HEXDIGITS.fullmatch(s) is None:
  raise ValueError("not a hexadecimal integer: %r" % s)
  return int(s, 16)
+
+
+def is_transfer_encoding_chunked(headers: httputil.HTTPHeaders) -> bool:
+ """Returns true if the headers specify Transfer-Encoding: chunked.
+
+ Raise httputil.HTTPInputError if any other transfer encoding is used.
+ """
+ # Note that transfer-encoding is an area in which postel's law can lead
+ # us astray. If a proxy and a backend server are liberal in what they accept,
+ # but accept slightly different things, this can lead to mismatched framing
+ # and request smuggling issues. Therefore we are as strict as possible here
+ # (even technically going beyond the requirements of the RFCs: a value of
+ # ",chunked" is legal but doesn't appear in practice for legitimate traffic)
+ if "Transfer-Encoding" not in headers:
+ return False
+ if "Content-Length" in headers:
+ # Message cannot contain both Content-Length and
+ # Transfer-Encoding headers.
+ # http:https://tools.ietf.org/html/rfc7230#section-3.3.3
+ raise httputil.HTTPInputError(
+ "Message with both Transfer-Encoding and Content-Length"
+ )
+ if headers["Transfer-Encoding"].lower() == "chunked":
+ return True
+ # We do not support any transfer-encodings other than chunked, and we do not
+ # expect to add any support because the concept of transfer-encoding has
+ # been removed in HTTP/2.
+ raise httputil.HTTPInputError(
+ "Unsupported Transfer-Encoding %s" % headers["Transfer-Encoding"]
+ )
diff --git a/tornado/httputil.py b/tornado/httputil.py
@@ -62,6 +62,9 @@
  from asyncio import Future # noqa: F401
  import unittest # noqa: F401
 
+# To be used with str.strip() and related methods.
+HTTP_WHITESPACE = " \t"
+
 
 @lru_cache(1000)
 def _normalize_header(name: str) -> str:
@@ -171,15 +174,15 @@ def parse_line(self, line: str) -> None:
  # continuation of a multi-line header
  if self._last_key is None:
  raise HTTPInputError("first header line cannot start with whitespace")
- new_part = " " + line.lstrip()
+ new_part = " " + line.lstrip(HTTP_WHITESPACE)
  self._as_list[self._last_key][-1] += new_part
  self._dict[self._last_key] += new_part
  else:
  try:
  name, value = line.split(":", 1)
  except ValueError:
  raise HTTPInputError("no colon in header line")
- self.add(name, value.strip())
+ self.add(name, value.strip(HTTP_WHITESPACE))
 
  @classmethod
  def parse(cls, headers: str) -> "HTTPHeaders":

diff --git a/tornado/test/httpserver_test.py b/tornado/test/httpserver_test.py
@@ -581,6 +581,76 @@ def test_chunked_request_body_invalid_size(self):
  )
  self.assertEqual(400, start_line.code)
 
+ def test_chunked_request_body_duplicate_header(self):
+ # Repeated Transfer-Encoding headers should be an error (and not confuse
+ # the chunked-encoding detection to mess up framing).
+ self.stream.write(
+ b"""\
+POST /echo HTTP/1.1
+Transfer-Encoding: chunked
+Transfer-encoding: chunked
+
+2
+ok
+0
+
+"""
+ )
+ with ExpectLog(
+ gen_log,
+ ".*Unsupported Transfer-Encoding chunked,chunked",
+ level=logging.INFO,
+ ):
+ start_line, headers, response = self.io_loop.run_sync(
+ lambda: read_stream_body(self.stream)
+ )
+ self.assertEqual(400, start_line.code)
+
+ def test_chunked_request_body_unsupported_transfer_encoding(self):
+ # We don't support transfer-encodings other than chunked.
+ self.stream.write(
+ b"""\
+POST /echo HTTP/1.1
+Transfer-Encoding: gzip, chunked
+
+2
+ok
+0
+
+"""
+ )
+ with ExpectLog(
+ gen_log, ".*Unsupported Transfer-Encoding gzip, chunked", level=logging.INFO
+ ):
+ start_line, headers, response = self.io_loop.run_sync(
+ lambda: read_stream_body(self.stream)
+ )
+ self.assertEqual(400, start_line.code)
+
+ def test_chunked_request_body_transfer_encoding_and_content_length(self):
+ # Transfer-encoding and content-length are mutually exclusive
+ self.stream.write(
+ b"""\
+POST /echo HTTP/1.1
+Transfer-Encoding: chunked
+Content-Length: 2
+
+2
+ok
+0
+
+"""
+ )
+ with ExpectLog(
+ gen_log,
+ ".*Message with both Transfer-Encoding and Content-Length",
+ level=logging.INFO,
+ ):
+ start_line, headers, response = self.io_loop.run_sync(
+ lambda: read_stream_body(self.stream)
+ )
+ self.assertEqual(400, start_line.code)
+
  @gen_test
  def test_invalid_content_length(self):
  # HTTP only allows decimal digits in content-length. Make sure we don't

diff --git a/tornado/test/httputil_test.py b/tornado/test/httputil_test.py
@@ -334,6 +334,25 @@ def test_unicode_newlines(self):
  gen_log.warning("failed while trying %r in %s", newline, encoding)
  raise
 
+ def test_unicode_whitespace(self):
+ # Only tabs and spaces are to be stripped according to the HTTP standard.
+ # Other unicode whitespace is to be left as-is. In the context of headers,
+ # this specifically means the whitespace characters falling within the
+ # latin1 charset.
+ whitespace = [
+ (" ", True), # SPACE
+ ("\t", True), # TAB
+ ("\u00a0", False), # NON-BREAKING SPACE
+ ("\u0085", False), # NEXT LINE
+ ]
+ for c, stripped in whitespace:
+ headers = HTTPHeaders.parse("Transfer-Encoding: %schunked" % c)
+ if stripped:
+ expected = [("Transfer-Encoding", "chunked")]
+ else:
+ expected = [("Transfer-Encoding", "%schunked" % c)]
+ self.assertEqual(expected, list(headers.get_all()))
+
  def test_optional_cr(self):
  # Both CRLF and LF should be accepted as separators. CR should not be
  # part of the data when followed by LF, but it is a normal char

diff --git a/tornado/test/simple_httpclient_test.py b/tornado/test/simple_httpclient_test.py
@@ -828,7 +828,7 @@ def test_chunked_with_content_length(self):
  with ExpectLog(
  gen_log,
  (
- "Malformed HTTP message from None: Response "
+ "Malformed HTTP message from None: Message "
  "with both Transfer-Encoding and Content-Length"
  ),
  level=logging.INFO,