From 1e9d57f80ca883881804292448fff4de8b112733 Mon Sep 17 00:00:00 2001 From: Levi B Date: Tue, 17 Mar 2015 16:50:02 -0700 Subject: [PATCH] UrlEncoder should always encode the U+003A COLON character Provides extra defense-in-depth in case an application is using this API to encode a relative URL, otherwise the part before the colon could inadvertently be treated as a scheme. --- .../UrlEncoder.cs | 26 +++++++++++++++---- .../UrlEncoderTests.cs | 3 +-- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/src/Microsoft.Framework.WebEncoders.Core/UrlEncoder.cs b/src/Microsoft.Framework.WebEncoders.Core/UrlEncoder.cs index c29fa70b..d3473453 100644 --- a/src/Microsoft.Framework.WebEncoders.Core/UrlEncoder.cs +++ b/src/Microsoft.Framework.WebEncoders.Core/UrlEncoder.cs @@ -131,12 +131,24 @@ internal UrlUnicodeEncoder(CodePointFilter filter) : base(filter, MaxOutputCharsPerInputChar) { // Per RFC 3987, Sec. 2.2, we want encodings that are safe for - // 'isegment', 'iquery', and 'ifragment'. The only thing these - // all have in common is 'ipchar', which is defined as such: + // four particular components: 'isegment', 'ipath-noscheme', + // 'iquery', and 'ifragment'. The relevant definitions are below. + // + // ipath-noscheme = isegment-nz-nc *( "/" isegment ) + // + // isegment = *ipchar + // + // isegment-nz-nc = 1*( iunreserved / pct-encoded / sub-delims + // / "@" ) + // ; non-zero-length segment without any colon ":" // // ipchar = iunreserved / pct-encoded / sub-delims / ":" // / "@" // + // iquery = *( ipchar / iprivate / "/" / "?" ) + // + // ifragment = *( ipchar / "/" / "?" ) + // // iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar // // ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF @@ -151,15 +163,19 @@ internal UrlUnicodeEncoder(CodePointFilter filter) // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" // / "*" / "+" / "," / ";" / "=" // - // From this list, the base encoder forbids "&", "'", "+", + // The only common characters between these four components are the + // intersection of 'isegment-nz-nc' and 'ipchar', which is really + // just 'isegment-nz-nc' (colons forbidden). + // + // From this list, the base encoder already forbids "&", "'", "+", // and we'll additionally forbid "=" since it has special meaning // in x-www-form-urlencoded representations. // // This means that the full list of allowed characters from the // Basic Latin set is: - // ALPHA / DIGIT / "-" / "." / "_" / "~" / "!" / "$" / "(" / ")" / "*" / "," / ";" / ":" / "@" + // ALPHA / DIGIT / "-" / "." / "_" / "~" / "!" / "$" / "(" / ")" / "*" / "," / ";" / "@" - const string forbiddenChars = @" #%/=?[\]^`{|}"; // chars from Basic Latin which aren't already disallowed by the base encoder + const string forbiddenChars = @" #%/:=?[\]^`{|}"; // chars from Basic Latin which aren't already disallowed by the base encoder foreach (char c in forbiddenChars) { ForbidCharacter(c); diff --git a/test/Microsoft.Framework.WebEncoders.Tests/UrlEncoderTests.cs b/test/Microsoft.Framework.WebEncoders.Tests/UrlEncoderTests.cs index 2141bba7..0d37e4f9 100644 --- a/test/Microsoft.Framework.WebEncoders.Tests/UrlEncoderTests.cs +++ b/test/Microsoft.Framework.WebEncoders.Tests/UrlEncoderTests.cs @@ -123,8 +123,7 @@ public void UrlEncode_AllRangesAllowed_StillEncodesForbiddenChars() case '_': case '~': - // ipchar - case ':': + // isegment-nz-nc case '@': // sub-delims