diff --git a/src/Microsoft.Extensions.WebEncoders.Core/Internals/UrlPathDecoder.cs b/src/Microsoft.Extensions.WebEncoders.Core/Internals/UrlPathDecoder.cs
new file mode 100644
index 00000000..93884083
--- /dev/null
+++ b/src/Microsoft.Extensions.WebEncoders.Core/Internals/UrlPathDecoder.cs
@@ -0,0 +1,458 @@
+// Copyright (c) .NET Foundation. All rights reserved.
+// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
+
+using System;
+using System.Text;
+
+namespace Microsoft.Extensions.WebEncoders.Internals
+{
+    /// <summary>
+    /// Unescapes percent-encoded URL path strings, either in place or into a new string.
+    /// </summary>
+    public class UrlPathDecoder
+    {
+        // Strict UTF-8: the exception fallbacks make malformed input throw instead of being
+        // silently replaced, which Unescape relies on to reject undecodable sequences.
+        private static readonly Encoding _utf8 = Encoding.GetEncoding(
+            "utf-8",
+            new EncoderExceptionFallback(),
+            new DecoderExceptionFallback());
+
+        /// <summary>
+        /// Unescape a url char array in place. Returns the length of the result.
+        ///
+        /// - Everything is unescaped except %2F ('/')
+        /// - UTF8 bytes are tested for formatting, overlong encoding, surrogates and value ranges
+        /// - Invalid escaped sequences are copied to output as is
+        /// - It doesn't check if the string contains a query
+        /// </summary>
+        /// <param name="buffer">
+        /// The char array containing the sequence of characters to be decoded. The
+        /// result will be saved in the same array.
+        /// </param>
+        /// <returns>The length of the result.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
+        public static int DecodeInPlace(char[] buffer)
+        {
+            if (buffer == null)
+            {
+                throw new ArgumentNullException(nameof(buffer));
+            }
+
+            return DecodeInPlace(buffer, buffer.Length);
+        }
+
+        /// <summary>
+        /// Unescape a url char array in place. Returns the length of the result.
+        ///
+        /// - Everything is unescaped except %2F ('/')
+        /// - UTF8 bytes are tested for formatting, overlong encoding, surrogates and value ranges
+        /// - Invalid escaped sequences are copied to output as is
+        /// - It doesn't check if the string contains a query
+        /// </summary>
+        /// <param name="buffer">
+        /// The char array containing the sequence of characters to be decoded. The
+        /// result will be saved in the same array.
+        /// </param>
+        /// <param name="len">
+        /// The length of the sequence of characters in buffer to be decoded.
+        /// </param>
+        /// <returns>The length of the result.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// <paramref name="len"/> is negative or larger than the size of <paramref name="buffer"/>.
+        /// </exception>
+        public static int DecodeInPlace(char[] buffer, int len)
+        {
+            if (buffer == null)
+            {
+                throw new ArgumentNullException(nameof(buffer));
+            }
+
+            if (len < 0 || len > buffer.Length)
+            {
+                throw new ArgumentOutOfRangeException(nameof(len));
+            }
+
+            if (GetNextEncoded(buffer, 0, len) == len)
+            {
+                // No '%' in range, so there is nothing to unescape.
+                return len;
+            }
+
+            return DecodeCore(buffer, len);
+        }
+
+        /// <summary>
+        /// Unescape a url path string.
+        ///
+        /// - Everything is unescaped except %2F ('/')
+        /// - UTF8 bytes are tested for formatting, overlong encoding, surrogates and value ranges
+        /// - Invalid escaped sequences are copied to output as is
+        /// - It doesn't check if the string contains a query
+        /// </summary>
+        /// <param name="original">The string to be decoded.</param>
+        /// <returns>The decoded result.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="original"/> is null.</exception>
+        public static string Decode(string original)
+        {
+            if (original == null)
+            {
+                throw new ArgumentNullException(nameof(original));
+            }
+
+            if (original.IndexOf('%') == -1)
+            {
+                // Nothing is escaped; return the same instance without allocating.
+                return original;
+            }
+
+            var buffer = original.ToCharArray();
+
+            // decode in place
+            var len = DecodeCore(buffer, buffer.Length);
+            return new string(buffer, 0, len);
+        }
+
+        /// <summary>
+        /// Decode the sequence of characters in the given char array.
+        /// </summary>
+        /// <param name="buffer">
+        /// The array of characters to be decoded. It's both the source and output of the
+        /// operation which means the decode happens in place.
+        /// </param>
+        /// <param name="length">
+        /// The length of the source sequence in the array to be decoded.
+        /// </param>
+        /// <returns>
+        /// The length of the result.
+        /// </returns>
+        private static int DecodeCore(char[] buffer, int length)
+        {
+            // two indices to read and write
+            var readerPosition = 0;
+            var writerPosition = 0;
+
+            // operating buffers; two chars are enough for a surrogate pair, which is the
+            // longest output a single 4-byte UTF-8 sequence can produce
+            var unescapedChars = new char[2];
+            var unescapedCharsCount = 0;
+            var bytesBuffer = new byte[4];
+
+            while (readerPosition < length)
+            {
+                var next = GetNextEncoded(buffer, readerPosition, length);
+                var copyLength = next - readerPosition;
+
+                // Shift the literal run in front of the next % down to the write position.
+                CopyInPlace(buffer, length, copyLength, ref readerPosition, ref writerPosition);
+
+                if (readerPosition >= length)
+                {
+                    break;
+                }
+
+                var consumed = Unescape(buffer, length, next, bytesBuffer, ref unescapedChars, ref unescapedCharsCount);
+                if (consumed == 0)
+                {
+                    // Skip unescaping the % as the sequence that follows it can't be correctly
+                    // decoded under UTF8
+                    CopyInPlace(buffer, length, 1, ref readerPosition, ref writerPosition);
+                }
+                else if (unescapedCharsCount == 1 && SkipUnescape(unescapedChars[0]))
+                {
+                    // Skip unescaping specified characters (eg. '/')
+                    // Copy the original sequence to destination
+                    CopyInPlace(buffer, length, consumed, ref readerPosition, ref writerPosition);
+                }
+                else
+                {
+                    // Copy the unescaped characters. Move to the next character in source.
+                    for (int i = 0; i < unescapedCharsCount; ++i)
+                    {
+                        buffer[writerPosition++] = unescapedChars[i];
+                    }
+
+                    readerPosition += consumed;
+                }
+            }
+
+            return writerPosition;
+        }
+
+        /// <summary>
+        /// Tells whether an unescaped character must stay in its escaped form. Only '/' does.
+        /// </summary>
+        private static bool SkipUnescape(char character)
+        {
+            return character == '/';
+        }
+
+        /// <summary>
+        /// Unescape a sequence of characters.
+        /// The sequence is a substring of the given source. The start index must point to
+        /// the % character that begins the sequence.
+        /// If the sequence following the % can be successfully decoded in UTF8,
+        /// - The output chars will be set to the unescaped character
+        /// - The length of the sequence, including the % characters, will be returned.
+        /// Otherwise 0 is returned.
+        /// </summary>
+        private static int Unescape(char[] source, int sourceBoundary, int start, byte[] bytesBuffer, ref char[] output, ref int count)
+        {
+            // a complete escape needs the % plus two hex digits inside the boundary
+            if (start + 2 >= sourceBoundary)
+            {
+                return 0;
+            }
+
+            byte firstByte;
+            if (!TryGetUnescapedByte(source, start, out firstByte))
+            {
+                return 0;
+            }
+
+            if (firstByte <= 0x7F)
+            {
+                // first <= U+007F, single byte ASCII
+                if (output.Length < 1)
+                {
+                    output = new char[] { (char)firstByte };
+                }
+                else
+                {
+                    output[0] = (char)firstByte;
+                }
+
+                count = 1;
+                return 3;
+            }
+
+            // anticipate the byte count
+            int currentDecodeBits = 0;
+            int bytesCount = 1;
+            int expectValueMin = 0;
+            bytesBuffer[0] = firstByte;
+            if ((firstByte & 0xE0) == 0xC0)
+            {
+                // 110x xxxx, expect 1 more byte
+                currentDecodeBits = firstByte & 0x1F;
+                bytesCount = 2;
+                expectValueMin = 0x80;
+            }
+            else if ((firstByte & 0xF0) == 0xE0)
+            {
+                // 1110 xxxx, expect 2 more bytes
+                currentDecodeBits = firstByte & 0x0F;
+                bytesCount = 3;
+                expectValueMin = 0x800;
+            }
+            else if ((firstByte & 0xF8) == 0xF0)
+            {
+                // 1111 0xxx, expect 3 more bytes
+                currentDecodeBits = firstByte & 0x07;
+                bytesCount = 4;
+                expectValueMin = 0x10000;
+            }
+            else
+            {
+                // invalid
+                return 0;
+            }
+
+            if (start + (bytesCount * 3) > sourceBoundary)
+            {
+                // less than expected bytes to decode
+                return 0;
+            }
+
+            var remainingBytes = bytesCount - 1;
+            while (remainingBytes > 0)
+            {
+                start += 3;
+                if (source[start] != '%')
+                {
+                    return 0;
+                }
+
+                byte v;
+                if (!TryGetUnescapedByte(source, start, out v))
+                {
+                    return 0;
+                }
+
+                if ((v & 0xC0) != 0x80)
+                {
+                    // not a 10xx xxxx continuation byte
+                    return 0;
+                }
+
+                // append the lower 6bit
+                currentDecodeBits = (currentDecodeBits << 6) | (v & 0x3F);
+
+                bytesBuffer[bytesCount - remainingBytes] = v;
+                remainingBytes--;
+
+                if (remainingBytes == 1 && currentDecodeBits >= 0x360 && currentDecodeBits <= 0x37F)
+                {
+                    // This is going to end up in the range of 0xD800-0xDFFF UTF-16 surrogates that are not allowed in UTF-8;
+                    return 0;
+                }
+
+                if (remainingBytes == 2 && currentDecodeBits >= 0x110)
+                {
+                    // This is going to be out of the upper Unicode bound 0x10FFFF.
+                    return 0;
+                }
+            }
+
+            if (currentDecodeBits < expectValueMin)
+            {
+                // Overlong encoding (e.g. using 2 bytes to encode something that only needed 1).
+                return 0;
+            }
+
+            try
+            {
+                // Convert the code point into char
+                count = _utf8.GetCharCount(bytesBuffer, 0, bytesCount);
+                if (count > output.Length)
+                {
+                    output = new char[count];
+                }
+
+                _utf8.GetChars(bytesBuffer, 0, bytesCount, output, 0);
+                return bytesCount * 3;
+            }
+            catch (DecoderFallbackException)
+            {
+                return 0;
+            }
+        }
+
+        /// <summary>
+        /// Copy characters in an array in place.
+        /// </summary>
+        /// <param name="buffer">
+        /// The array containing the characters to be copied. It is both the
+        /// source and output of the operation.
+        /// </param>
+        /// <param name="sourceLength">
+        /// The length of the source sequence.
+        /// </param>
+        /// <param name="count">
+        /// The count of the characters to be copied in the <paramref name="buffer"/>.
+        /// </param>
+        /// <param name="readerPosition">
+        /// The index where characters are copied from. The parameter will be
+        /// set to the position right behind the last read character after copy.
+        /// </param>
+        /// <param name="writerPosition">
+        /// The index where characters are copied to. The parameter will be set to
+        /// the position right behind the last written character after copy.
+        /// </param>
+        /// <returns>The number of characters actually copied.</returns>
+        private static int CopyInPlace(char[] buffer, int sourceLength, int count, ref int readerPosition, ref int writerPosition)
+        {
+            if (buffer == null)
+            {
+                throw new ArgumentNullException(nameof(buffer));
+            }
+
+            if (sourceLength > buffer.Length)
+            {
+                throw new ArgumentOutOfRangeException(nameof(sourceLength), "The length of the source sequence can't be longer than the size of the buffer.");
+            }
+
+            if (readerPosition < 0 || readerPosition >= sourceLength)
+            {
+                throw new ArgumentOutOfRangeException(nameof(readerPosition), $"The index of the source sequence {readerPosition} is out of range.");
+            }
+
+            if (writerPosition < 0 || writerPosition >= buffer.Length)
+            {
+                throw new ArgumentOutOfRangeException(nameof(writerPosition), $"The index of the output sequence {writerPosition} is out of range.");
+            }
+
+            if (writerPosition > readerPosition)
+            {
+                throw new ArgumentException($"The index of output sequence {writerPosition} is behind the read sequence {readerPosition}.");
+            }
+
+            for (var i = 0; i < count; ++i)
+            {
+                buffer[writerPosition++] = buffer[readerPosition++];
+
+                // when the reader pointer surpasses the boundary of the source sequence; or
+                // the writer pointer surpasses the boundary of the buffer
+                // return the count of the copied characters
+                if (writerPosition >= buffer.Length || readerPosition >= sourceLength)
+                {
+                    return i + 1;
+                }
+            }
+
+            return count;
+        }
+
+        /// <summary>
+        /// Find the next % in the sequence of range [start, end)
+        /// </summary>
+        /// <param name="buffer">The array of characters in which the % is searched.</param>
+        /// <param name="start">The start of the search range.</param>
+        /// <param name="end">The end of the search range.</param>
+        /// <returns>The index of the first %, or <paramref name="end"/> if % is not found.</returns>
+        private static int GetNextEncoded(char[] buffer, int start, int end)
+        {
+            for (var i = start; i < end; ++i)
+            {
+                if (buffer[i] == '%')
+                {
+                    return i;
+                }
+            }
+
+            return end;
+        }
+
+        /// <summary>
+        /// Reads the two hex digits that follow the character at position as a single byte.
+        /// </summary>
+        private static bool TryGetUnescapedByte(char[] buffer, int position, out byte result)
+        {
+            if (!IsHex(buffer[position + 1]) || !IsHex(buffer[position + 2]))
+            {
+                result = default(byte);
+                return false;
+            }
+
+            result = (byte)((HexToDec(buffer[position + 1]) << 4) + HexToDec(buffer[position + 2]));
+            return true;
+        }
+
+        private static bool IsHex(char value)
+        {
+            return (((value >= '0') && (value <= '9')) ||
+                    ((value >= 'A') && (value <= 'F')) ||
+                    ((value >= 'a') && (value <= 'f')));
+        }
+
+        /// <summary>
+        /// Converts a hex digit to its value. Callers must check IsHex first; other input is not rejected.
+        /// </summary>
+        private static int HexToDec(char value)
+        {
+            if (value <= '9')
+            {
+                return value - '0';
+            }
+            else if (value <= 'F')
+            {
+                return (value - 'A') + 10;
+            }
+            else // a - f
+            {
+                return (value - 'a') + 10;
+            }
+        }
+    }
+}
diff --git a/src/Microsoft.Extensions.WebEncoders.Core/project.json b/src/Microsoft.Extensions.WebEncoders.Core/project.json
index 0da3e708..921c9c75 100644
--- a/src/Microsoft.Extensions.WebEncoders.Core/project.json
+++ b/src/Microsoft.Extensions.WebEncoders.Core/project.json
@@ -15,11 +15,13 @@
     "dnxcore50": {
       "dependencies": {
         "System.ComponentModel": "4.0.1-beta-*",
+        "System.Collections": "4.0.11-beta-*",
         "System.Diagnostics.Debug": "4.0.11-beta-*",
         "System.IO": "4.0.11-beta-*",
         "System.Reflection": "4.0.10-*",
         "System.Resources.ResourceManager": "4.0.1-beta-*",
         "System.Runtime.Extensions": "4.0.11-beta-*",
+        "System.Text.Encoding": "4.0.11-beta-*",
         "System.Threading": "4.0.11-beta-*"
       }
     }
diff --git a/test/Microsoft.Extensions.WebEncoders.Tests/UrlPathDecoderTests.cs b/test/Microsoft.Extensions.WebEncoders.Tests/UrlPathDecoderTests.cs
new file mode 100644
index 00000000..efc90480
--- /dev/null
+++ b/test/Microsoft.Extensions.WebEncoders.Tests/UrlPathDecoderTests.cs
@@ -0,0 +1,181 @@
+// Copyright (c) .NET Foundation. All rights reserved.
+// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
+
+using System;
+using Microsoft.Extensions.WebEncoders.Internals;
+using Xunit;
+
+namespace Microsoft.Extensions.WebEncoders.Tests
+{
+    // Shared cases; each subclass binds the asserts to one UrlPathDecoder entry point.
+    public abstract class UrlPathDecoderTestBase
+    {
+        protected abstract void PositiveAssert(string raw, string expect);
+
+        protected abstract void PositiveAssert(string raw);
+
+        protected abstract void NegativeAssert(string raw);
+
+        [Fact]
+        public void Empty()
+        {
+            PositiveAssert(string.Empty, string.Empty);
+        }
+
+        [Fact]
+        public void WhiteSpace()
+        {
+            PositiveAssert(" ", " ");
+        }
+
+        [Fact]
+        public void ThrowNullArgument()
+        {
+            Assert.Throws<ArgumentNullException>(() => UrlPathDecoder.Decode(null));
+        }
+
+        [Theory]
+        [InlineData("/foo/bar", "/foo/bar")]
+        [InlineData("/foo/BAR", "/foo/BAR")]
+        [InlineData("/foo/", "/foo/")]
+        [InlineData("/", "/")]
+        public void NormalCases(string raw, string expect)
+        {
+            PositiveAssert(raw, expect);
+        }
+
+        [Theory]
+        [InlineData("%2F", "%2F")]
+        [InlineData("/foo%2Fbar", "/foo%2Fbar")]
+        [InlineData("/foo%2F%20bar", "/foo%2F bar")]
+        public void SkipForwardSlash(string raw, string expect)
+        {
+            PositiveAssert(raw, expect);
+        }
+
+        [Theory]
+        [InlineData("%C3%84ra%20Benetton", "Ära Benetton")]
+        [InlineData("%E6%88%91%E8%87%AA%E6%A8%AA%E5%88%80%E5%90%91%E5%A4%A9%E7%AC%91%E5%8E%BB%E7%95%99%E8%82%9D%E8%83%86%E4%B8%A4%E6%98%86%E4%BB%91", "我自横刀向天笑去留肝胆两昆仑")]
+        public void Internationalized(string raw, string expect)
+        {
+            PositiveAssert(raw, expect);
+        }
+
+        [Theory]
+        // Well-formed sequences of 1 to 4 bytes; hex digits may be upper or lower case.
+        [InlineData("%D0%A4", "Ф")]
+        [InlineData("%d0%a4", "Ф")]
+        [InlineData("%E0%A4%AD", "भ")]
+        [InlineData("%e0%A4%Ad", "भ")]
+        [InlineData("%F0%A4%AD%A2", "𤭢")]
+        [InlineData("%F0%a4%Ad%a2", "𤭢")]
+        [InlineData("%48%65%6C%6C%6F%20%57%6F%72%6C%64", "Hello World")]
+        [InlineData("%48%65%6C%6C%6F%2D%C2%B5%40%C3%9F%C3%B6%C3%A4%C3%BC%C3%A0%C3%A1", "Hello-µ@ßöäüàá")]
+        public void ValidUTF8(string raw, string expect)
+        {
+            PositiveAssert(raw, expect);
+        }
+
+        [Theory]
+        // Smallest code point of each encoded length (just past overlong), then the largest valid one.
+        [InlineData("%C2%80")]
+        [InlineData("%E0%A0%80")]
+        [InlineData("%F0%90%80%80")]
+        [InlineData("%F4%8F%BF%BF")]
+        public void ValidUTF8(string raw)
+        {
+            PositiveAssert(raw);
+        }
+
+        [Theory]
+        // Overlong encodings
+        [InlineData("%C0%A4")]
+        [InlineData("%C1%BF")]
+        [InlineData("%E0%80%AF")]
+        [InlineData("%E0%9F%BF")]
+        [InlineData("%F0%80%80%AF")]
+        [InlineData("%F0%8F%8F%BF")]
+        // Incomplete
+        [InlineData("%")]
+        [InlineData("%%")]
+        [InlineData("%A")]
+        [InlineData("%Y")]
+        [InlineData("%ED%A0%80")] // UTF-16 surrogate U+D800 is not allowed in UTF-8
+        public void InvalidUTF8(string raw)
+        {
+            NegativeAssert(raw);
+        }
+    }
+
+    // Runs the shared cases against UrlPathDecoder.Decode(string).
+    public class UrlPathDecoderTests : UrlPathDecoderTestBase
+    {
+        protected override void NegativeAssert(string raw)
+        {
+            // invalid sequences are left untouched
+            Assert.Equal(raw, UrlPathDecoder.Decode(raw));
+        }
+
+        protected override void PositiveAssert(string raw)
+        {
+            Assert.NotEqual(raw, UrlPathDecoder.Decode(raw));
+        }
+
+        protected override void PositiveAssert(string raw, string expect)
+        {
+            Assert.Equal(expect, UrlPathDecoder.Decode(raw));
+        }
+    }
+
+    // Runs the shared cases against UrlPathDecoder.DecodeInPlace(char[]).
+    public class UrlPathInPlaceDecoderTests : UrlPathDecoderTestBase
+    {
+        protected override void PositiveAssert(string raw, string expect)
+        {
+            var buf = raw.ToCharArray();
+            var len = UrlPathDecoder.DecodeInPlace(buf);
+
+            Assert.Equal(expect.Length, len);
+            Assert.Equal(expect.ToCharArray(), new ArraySegment<char>(buf, 0, len));
+        }
+
+        protected override void PositiveAssert(string raw)
+        {
+            var buf = raw.ToCharArray();
+            var len = UrlPathDecoder.DecodeInPlace(buf);
+
+            Assert.NotEqual(raw.Length, len);
+        }
+
+        protected override void NegativeAssert(string raw)
+        {
+            var buf = raw.ToCharArray();
+            var len = UrlPathDecoder.DecodeInPlace(buf);
+
+            Assert.Equal(raw.Length, len);
+            Assert.Equal(raw.ToCharArray(), buf);
+        }
+
+        [Theory]
+        [InlineData("/foo%2Fbar", 10, "/foo%2Fbar", 10)]
+        [InlineData("/foo%2Fbar", 9, "/foo%2Fba", 9)]
+        [InlineData("/foo%2Fbar", 8, "/foo%2Fb", 8)]
+        [InlineData("%D0%A4", 6, "Ф", 1)]
+        [InlineData("%D0%A4", 5, "%D0%A", 5)]
+        [InlineData("%D0%A4", 4, "%D0%", 4)]
+        [InlineData("%D0%A4", 3, "%D0", 3)]
+        [InlineData("%D0%A4", 2, "%D", 2)]
+        [InlineData("%D0%A4", 1, "%", 1)]
+        [InlineData("%D0%A4", 0, "", 0)]
+        [InlineData("%C2%B5%40%C3%9F%C3%B6%C3%A4%C3%BC%C3%A0%C3%A1", 45, "µ@ßöäüàá", 8)]
+        [InlineData("%C2%B5%40%C3%9F%C3%B6%C3%A4%C3%BC%C3%A0%C3%A1", 44, "µ@ßöäüà%C3%A", 12)]
+        public void DecodeWithBoundary(string raw, int rawLength, string expect, int expectLength)
+        {
+            var buf = raw.ToCharArray();
+            var len = UrlPathDecoder.DecodeInPlace(buf, rawLength);
+
+            Assert.Equal(expectLength, len);
+            Assert.Equal(expect.ToCharArray(), new ArraySegment<char>(buf, 0, expectLength));
+        }
+    }
+}
\ No newline at end of file
diff --git a/test/Microsoft.Extensions.WebEncoders.Tests/project.json b/test/Microsoft.Extensions.WebEncoders.Tests/project.json
index 87d9e98e..f35c5911 100644
--- a/test/Microsoft.Extensions.WebEncoders.Tests/project.json
+++ b/test/Microsoft.Extensions.WebEncoders.Tests/project.json
@@ -15,6 +15,7 @@
     "dnx451": { },
     "dnxcore50": {
       "dependencies": {
+        "System.Text.Encoding": "4.0.11-beta-*",
         "System.Text.Encoding.Extensions": "4.0.11-beta-*"
       }
     }