From 1e9d57f80ca883881804292448fff4de8b112733 Mon Sep 17 00:00:00 2001
From: Levi B <levib@yahoo.com>
Date: Tue, 17 Mar 2015 16:50:02 -0700
Subject: [PATCH] UrlEncoder should always encode the U+003A COLON character
 Provides extra defense-in-depth in case an application is using this API to
 encode a relative URL; otherwise, the part before the colon could
 inadvertently be treated as a scheme.

---
 .../UrlEncoder.cs                             | 26 +++++++++++++++----
 .../UrlEncoderTests.cs                        |  3 +--
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/src/Microsoft.Framework.WebEncoders.Core/UrlEncoder.cs b/src/Microsoft.Framework.WebEncoders.Core/UrlEncoder.cs
index c29fa70b..d3473453 100644
--- a/src/Microsoft.Framework.WebEncoders.Core/UrlEncoder.cs
+++ b/src/Microsoft.Framework.WebEncoders.Core/UrlEncoder.cs
@@ -131,12 +131,24 @@ internal UrlUnicodeEncoder(CodePointFilter filter)
                 : base(filter, MaxOutputCharsPerInputChar)
             {
                 // Per RFC 3987, Sec. 2.2, we want encodings that are safe for
-                // 'isegment', 'iquery', and 'ifragment'. The only thing these
-                // all have in common is 'ipchar', which is defined as such:
+                // four particular components: 'isegment', 'ipath-noscheme',
+                // 'iquery', and 'ifragment'. The relevant definitions are below.
+                //
+                //    ipath-noscheme = isegment-nz-nc *( "/" isegment )
+                // 
+                //    isegment       = *ipchar
+                // 
+                //    isegment-nz-nc = 1*( iunreserved / pct-encoded / sub-delims
+                //                         / "@" )
+                //                   ; non-zero-length segment without any colon ":"
                 //
                 //    ipchar         = iunreserved / pct-encoded / sub-delims / ":"
                 //                   / "@"
                 // 
+                //    iquery         = *( ipchar / iprivate / "/" / "?" )
+                // 
+                //    ifragment      = *( ipchar / "/" / "?" )
+                // 
                 //    iunreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
                 // 
                 //    ucschar        = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
@@ -151,15 +163,19 @@ internal UrlUnicodeEncoder(CodePointFilter filter)
                 //    sub-delims     = "!" / "$" / "&" / "'" / "(" / ")"
                 //                   / "*" / "+" / "," / ";" / "="
                 //
-                // From this list, the base encoder forbids "&", "'", "+",
+                // The characters common to all four components are the
+                // intersection of 'isegment-nz-nc' and 'ipchar', which is really
+                // just 'isegment-nz-nc' (colons forbidden).
+                // 
+                // From this list, the base encoder already forbids "&", "'", "+",
                 // and we'll additionally forbid "=" since it has special meaning
                 // in x-www-form-urlencoded representations.
                 //
                 // This means that the full list of allowed characters from the
                 // Basic Latin set is:
-                // ALPHA / DIGIT / "-" / "." / "_" / "~" / "!" / "$" / "(" / ")" / "*" / "," / ";" / ":" / "@"
+                // ALPHA / DIGIT / "-" / "." / "_" / "~" / "!" / "$" / "(" / ")" / "*" / "," / ";" / "@"
 
-                const string forbiddenChars = @" #%/=?[\]^`{|}"; // chars from Basic Latin which aren't already disallowed by the base encoder
+                const string forbiddenChars = @" #%/:=?[\]^`{|}"; // chars from Basic Latin which aren't already disallowed by the base encoder
                 foreach (char c in forbiddenChars)
                 {
                     ForbidCharacter(c);
diff --git a/test/Microsoft.Framework.WebEncoders.Tests/UrlEncoderTests.cs b/test/Microsoft.Framework.WebEncoders.Tests/UrlEncoderTests.cs
index 2141bba7..0d37e4f9 100644
--- a/test/Microsoft.Framework.WebEncoders.Tests/UrlEncoderTests.cs
+++ b/test/Microsoft.Framework.WebEncoders.Tests/UrlEncoderTests.cs
@@ -123,8 +123,7 @@ public void UrlEncode_AllRangesAllowed_StillEncodesForbiddenChars()
                             case '_':
                             case '~':
 
-                            // ipchar
-                            case ':':
+                            // isegment-nz-nc
                             case '@':
 
                             // sub-delims