Skip to content
This repository was archived by the owner on Nov 20, 2018. It is now read-only.

UrlEncoder should always encode the U+003A COLON character #235

Merged
merged 1 commit into from
Mar 18, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 21 additions & 5 deletions src/Microsoft.Framework.WebEncoders.Core/UrlEncoder.cs
Original file line number Diff line number Diff line change
@@ -131,12 +131,24 @@ internal UrlUnicodeEncoder(CodePointFilter filter)
  : base(filter, MaxOutputCharsPerInputChar)
  {
  // Per RFC 3987, Sec. 2.2, we want encodings that are safe for
- // 'isegment', 'iquery', and 'ifragment'. The only thing these
- // all have in common is 'ipchar', which is defined as such:
+ // four particular components: 'isegment', 'ipath-noscheme',
+ // 'iquery', and 'ifragment'. The relevant definitions are below.
  //
+ // ipath-noscheme = isegment-nz-nc *( "/" isegment )
+ //
+ // isegment = *ipchar
+ //
+ // isegment-nz-nc = 1*( iunreserved / pct-encoded / sub-delims
+ // / "@" )
+ // ; non-zero-length segment without any colon ":"
+ //
  // ipchar = iunreserved / pct-encoded / sub-delims / ":"
  // / "@"
  //
+ // iquery = *( ipchar / iprivate / "/" / "?" )
+ //
+ // ifragment = *( ipchar / "/" / "?" )
+ //
  // iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
  //
  // ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
@@ -151,15 +163,19 @@ internal UrlUnicodeEncoder(CodePointFilter filter)
  // sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
  // / "*" / "+" / "," / ";" / "="
  //
- // From this list, the base encoder forbids "&", "'", "+",
+ // The only common characters between these four components are the
+ // intersection of 'isegment-nz-nc' and 'ipchar', which is really
+ // just 'isegment-nz-nc' (colons forbidden).
+ //
+ // From this list, the base encoder already forbids "&", "'", "+",
  // and we'll additionally forbid "=" since it has special meaning
  // in x-www-form-urlencoded representations.
  //
  // This means that the full list of allowed characters from the
  // Basic Latin set is:
- // ALPHA / DIGIT / "-" / "." / "_" / "~" / "!" / "$" / "(" / ")" / "*" / "," / ";" / ":" / "@"
+ // ALPHA / DIGIT / "-" / "." / "_" / "~" / "!" / "$" / "(" / ")" / "*" / "," / ";" / "@"

- const string forbiddenChars = @" #%/=?[\]^`{|}"; // chars from Basic Latin which aren't already disallowed by the base encoder
+ const string forbiddenChars = @" #%/:=?[\]^`{|}"; // chars from Basic Latin which aren't already disallowed by the base encoder
  foreach (char c in forbiddenChars)
  {
  ForbidCharacter(c);
Expand Down
Original file line number Diff line number Diff line change
@@ -123,8 +123,7 @@ public void UrlEncode_AllRangesAllowed_StillEncodesForbiddenChars()
  case '_':
  case '~':

- // ipchar
- case ':':
+ // isegment-nz-nc
  case '@':

  // sub-delims
Expand Down