Add Url Path Decoder

troydai · troydai · commit 60d50aa3e0f3 · 2015-10-20T01:20:09.000-07:00
diff --git a/src/Microsoft.Extensions.WebEncoders.Core/PathDecoder.cs b/src/Microsoft.Extensions.WebEncoders.Core/PathDecoder.cs
@@ -0,0 +1,271 @@
+﻿// Copyright (c) .NET Foundation. All rights reserved.
+// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
+
+using System;
+
+namespace Microsoft.Extensions.WebEncoders
+{
+    /// <summary>
+    /// Decodes percent-encoded UTF-8 sequences in a URL path. Escaped path separators
+    /// (<c>%2F</c>) and sequences that are not well-formed UTF-8 are kept in escaped form.
+    /// </summary>
+    public class PathDecoder
+    {
+        private const int MaxCodePoint = 0x10FFFF;
+        private const int FirstSupplementaryCodePoint = 0x10000;
+        private const int HighSurrogateStart = 0xD800;
+        private const int LowSurrogateStart = 0xDC00;
+        private const int LowSurrogateEnd = 0xDFFF;
+
+        /// <summary>
+        /// Decodes the percent-encoded sequences in <paramref name="raw"/>.
+        /// </summary>
+        /// <param name="raw">The percent-encoded path.</param>
+        /// <returns>
+        /// The decoded path. The same instance is returned when <paramref name="raw"/>
+        /// contains no <c>%</c> character.
+        /// </returns>
+        /// <exception cref="ArgumentNullException"><paramref name="raw"/> is <c>null</c>.</exception>
+        public static string Decode(string raw)
+        {
+            if (raw == null)
+            {
+                throw new ArgumentNullException(nameof(raw));
+            }
+
+            if (raw.IndexOf('%') < 0)
+            {
+                // Nothing to decode; return the input without allocating.
+                return raw;
+            }
+
+            // A decoded sequence is never longer than its escaped form, so the output always fits.
+            var buffer = new char[raw.Length];
+            var length = DecodeCore(raw, buffer);
+
+            return new string(buffer, 0, length);
+        }
+
+        /// <summary>
+        /// Decodes <paramref name="source"/> into <paramref name="buffer"/>.
+        /// </summary>
+        /// <returns>The number of characters written to <paramref name="buffer"/>.</returns>
+        private static int DecodeCore(string source, char[] buffer)
+        {
+            var sourcePosition = 0;
+            var bufferPosition = 0;
+
+            while (sourcePosition < source.Length)
+            {
+                // Copy the literal run up to the next '%' (or to the end of the source).
+                var next = source.IndexOf('%', sourcePosition);
+                if (next < 0)
+                {
+                    next = source.Length;
+                }
+
+                var literalLength = next - sourcePosition;
+                source.CopyTo(sourcePosition, buffer, bufferPosition, literalLength);
+                sourcePosition += literalLength;
+                bufferPosition += literalLength;
+
+                if (sourcePosition == source.Length)
+                {
+                    break;
+                }
+
+                int codePoint;
+                var consumed = Unescape(source, sourcePosition, out codePoint);
+                if (consumed == 0)
+                {
+                    // The sequence after this '%' is not well-formed UTF-8: keep the '%'
+                    // literally and resume scanning right after it.
+                    buffer[bufferPosition++] = source[sourcePosition++];
+                }
+                else if (SkipUnescape(codePoint))
+                {
+                    // Reserved character (e.g. '/'): copy the escaped form unchanged.
+                    source.CopyTo(sourcePosition, buffer, bufferPosition, consumed);
+                    sourcePosition += consumed;
+                    bufferPosition += consumed;
+                }
+                else
+                {
+                    bufferPosition += WriteCodePoint(codePoint, buffer, bufferPosition);
+                    sourcePosition += consumed;
+                }
+            }
+
+            return bufferPosition;
+        }
+
+        /// <summary>
+        /// Returns true when <paramref name="codePoint"/> must stay percent-encoded in the output.
+        /// </summary>
+        private static bool SkipUnescape(int codePoint)
+        {
+            return codePoint == '/';
+        }
+
+        /// <summary>
+        /// Writes <paramref name="codePoint"/> to <paramref name="buffer"/> at <paramref name="index"/>
+        /// as UTF-16: one char for the Basic Multilingual Plane, a surrogate pair otherwise.
+        /// </summary>
+        /// <returns>The number of chars written (1 or 2).</returns>
+        private static int WriteCodePoint(int codePoint, char[] buffer, int index)
+        {
+            if (codePoint < FirstSupplementaryCodePoint)
+            {
+                buffer[index] = (char)codePoint;
+                return 1;
+            }
+
+            var offset = codePoint - FirstSupplementaryCodePoint;
+            buffer[index] = (char)(HighSurrogateStart + (offset >> 10));
+            buffer[index + 1] = (char)(LowSurrogateStart + (offset & 0x3FF));
+            return 2;
+        }
+
+        /// <summary>
+        /// Decodes the percent-encoded UTF-8 sequence that begins at <paramref name="start"/>,
+        /// which must be the index of a <c>%</c> character in <paramref name="source"/>.
+        /// </summary>
+        /// <param name="source">The percent-encoded text.</param>
+        /// <param name="start">The index of the leading <c>%</c>.</param>
+        /// <param name="codePoint">The decoded Unicode code point, or 0 on failure.</param>
+        /// <returns>
+        /// The number of source characters the sequence occupies (3 per encoded byte), or 0 when
+        /// the sequence is truncated, not hexadecimal, or not well-formed UTF-8.
+        /// </returns>
+        private static int Unescape(string source, int start, out int codePoint)
+        {
+            codePoint = 0;
+
+            int lead;
+            if (!TryReadEscapedByte(source, start, out lead))
+            {
+                return 0;
+            }
+
+            if ((lead & 0x80) == 0)
+            {
+                // 0xxx xxxx: single-byte ASCII.
+                codePoint = lead;
+                return 3;
+            }
+
+            int value;
+            int continuationCount;
+            int minValue;
+            if ((lead & 0xE0) == 0xC0)
+            {
+                // 110x xxxx: one continuation byte.
+                value = lead & 0x1F;
+                continuationCount = 1;
+                minValue = 0x80;
+            }
+            else if ((lead & 0xF0) == 0xE0)
+            {
+                // 1110 xxxx: two continuation bytes.
+                value = lead & 0x0F;
+                continuationCount = 2;
+                minValue = 0x800;
+            }
+            else if ((lead & 0xF8) == 0xF0)
+            {
+                // 1111 0xxx: three continuation bytes.
+                value = lead & 0x07;
+                continuationCount = 3;
+                minValue = FirstSupplementaryCodePoint;
+            }
+            else
+            {
+                // A continuation byte or an invalid lead byte cannot start a sequence.
+                return 0;
+            }
+
+            var position = start;
+            for (var i = 0; i < continuationCount; i++)
+            {
+                position += 3;
+
+                int continuation;
+                if (!TryReadEscapedByte(source, position, out continuation) ||
+                    (continuation & 0xC0) != 0x80)
+                {
+                    // Missing, malformed, or not of the form 10xx xxxx.
+                    return 0;
+                }
+
+                // Append the low six payload bits.
+                value = (value << 6) | (continuation & 0x3F);
+            }
+
+            if (value < minValue)
+            {
+                // Overlong encoding (e.g. two bytes used for a value that fits in one).
+                return 0;
+            }
+
+            if (value > MaxCodePoint ||
+                (value >= HighSurrogateStart && value <= LowSurrogateEnd))
+            {
+                // Beyond the Unicode range, or a UTF-16 surrogate, which UTF-8 must not encode.
+                return 0;
+            }
+
+            codePoint = value;
+            return (continuationCount + 1) * 3;
+        }
+
+        /// <summary>
+        /// Reads one escaped byte (<c>%XX</c>) from <paramref name="source"/> at <paramref name="index"/>.
+        /// </summary>
+        /// <returns>
+        /// True when three characters are available at <paramref name="index"/> and form a
+        /// <c>%</c> followed by two hexadecimal digits; otherwise false.
+        /// </returns>
+        private static bool TryReadEscapedByte(string source, int index, out int value)
+        {
+            value = 0;
+
+            if (index + 2 >= source.Length || source[index] != '%')
+            {
+                return false;
+            }
+
+            var high = HexToDec(source[index + 1]);
+            var low = HexToDec(source[index + 2]);
+            if (high < 0 || low < 0)
+            {
+                return false;
+            }
+
+            value = (high << 4) | low;
+            return true;
+        }
+
+        /// <summary>
+        /// Converts a hexadecimal digit to its numeric value, or returns -1 for any other character.
+        /// </summary>
+        private static int HexToDec(char value)
+        {
+            if (value >= '0' && value <= '9')
+            {
+                return value - '0';
+            }
+
+            if (value >= 'A' && value <= 'F')
+            {
+                return (value - 'A') + 10;
+            }
+
+            if (value >= 'a' && value <= 'f')
+            {
+                return (value - 'a') + 10;
+            }
+
+            return -1;
+        }
+    }
+}
diff --git a/src/Microsoft.Extensions.WebEncoders.Core/project.json b/src/Microsoft.Extensions.WebEncoders.Core/project.json
@@ -15,11 +15,13 @@
         "dnxcore50": {
             "dependencies": {
                 "System.ComponentModel": "4.0.1-beta-*",
+                "System.Collections": "4.0.11-beta-*",
                 "System.Diagnostics.Debug": "4.0.11-beta-*",
                 "System.IO": "4.0.11-beta-*",
                 "System.Reflection": "4.0.10-*",
                 "System.Resources.ResourceManager": "4.0.1-beta-*",
                 "System.Runtime.Extensions": "4.0.11-beta-*",
+                "System.Text.Encoding": "4.0.11-beta-*",
                 "System.Threading": "4.0.11-beta-*"
             }
         }
diff --git a/test/Microsoft.Extensions.WebEncoders.Tests/PathDecoderTests.cs b/test/Microsoft.Extensions.WebEncoders.Tests/PathDecoderTests.cs
@@ -0,0 +1,42 @@
+﻿// Copyright (c) .NET Foundation. All rights reserved.
+// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
+
+using Xunit;
+
+namespace Microsoft.Extensions.WebEncoders.Tests
+{
+    public class PathDecoderTests
+    {
+        // Each case pairs a raw (percent-encoded) path with the exact string Decode must return.
+        [Theory]
+        // No escape sequences: the input comes back unchanged.
+        [InlineData("", "")]
+        [InlineData("/Path", "/Path")]
+        // Single-byte escapes, with upper- and lower-case hex digits.
+        [InlineData("/Path%20space", "/Path space")]
+        [InlineData("/Path%7e", "/Path~")]
+        // An escaped '%' is decoded once and never re-interpreted.
+        [InlineData("/Path%2520space", "/Path%20space")]
+        // Escaped path separators stay escaped.
+        [InlineData("/Path%2Fspace", "/Path%2Fspace")]
+        [InlineData("/Path%2fspace", "/Path%2fspace")]
+        // Truncated or non-hexadecimal escapes are copied verbatim.
+        [InlineData("/Path/space%", "/Path/space%")]
+        [InlineData("/Path/space%a", "/Path/space%a")]
+        [InlineData("/Path%zzspace", "/Path%zzspace")]
+        [InlineData("%", "%")]
+        [InlineData("%%", "%%")]
+        // Well-formed multi-byte UTF-8 sequences.
+        [InlineData("%C3%84ra%20Benetton", "Ära Benetton")]
+        [InlineData("%E6%88%91%E8%87%AA%E6%A8%AA%E5%88%80%E5%90%91%E5%A4%A9%E7%AC%91%E5%8E%BB%E7%95%99%E8%82%9D%E8%83%86%E4%B8%A4%E6%98%86%E4%BB%91", "我自横刀向天笑去留肝胆两昆仑")]
+        // Malformed UTF-8: bad continuation byte, truncated sequence, overlong '/', UTF-16 surrogate.
+        [InlineData("%C3%28", "%C3(")]
+        [InlineData("%E6%88", "%E6%88")]
+        [InlineData("%C0%AF", "%C0%AF")]
+        [InlineData("%ED%A0%80", "%ED%A0%80")]
+        public void DecodeUri(string raw, string expect)
+        {
+            Assert.Equal(expect, PathDecoder.Decode(raw));
+        }
+    }
+}
diff --git a/test/Microsoft.Extensions.WebEncoders.Tests/project.json b/test/Microsoft.Extensions.WebEncoders.Tests/project.json
@@ -15,6 +15,7 @@
     "dnx451": { },
     "dnxcore50": {
       "dependencies": {
+        "System.Text.Encoding": "4.0.11-beta-*",
         "System.Text.Encoding.Extensions": "4.0.11-beta-*"
       }
     }

Original file line number	Diff line number	Diff line change
`@@ -15,11 +15,13 @@`
`15`	`15`	`"dnxcore50": {`
`16`	`16`	`"dependencies": {`
`17`	`17`	`"System.ComponentModel": "4.0.1-beta-*",`
	`18`	`+ "System.Collections": "4.0.11-beta-*",`
`18`	`19`	`"System.Diagnostics.Debug": "4.0.11-beta-*",`
`19`	`20`	`"System.IO": "4.0.11-beta-*",`
`20`	`21`	`"System.Reflection": "4.0.10-*",`
`21`	`22`	`"System.Resources.ResourceManager": "4.0.1-beta-*",`
`22`	`23`	`"System.Runtime.Extensions": "4.0.11-beta-*",`
	`24`	`+ "System.Text.Encoding": "4.0.11-beta-*",`
`23`	`25`	`"System.Threading": "4.0.11-beta-*"`
`24`	`26`	`}`
`25`	`27`	`}`
Original file line number	Diff line number	Diff line change
`@@ -15,6 +15,7 @@`
`15`	`15`	`"dnx451": { },`
`16`	`16`	`"dnxcore50": {`
`17`	`17`	`"dependencies": {`
	`18`	`+ "System.Text.Encoding": "4.0.11-beta-*",`
`18`	`19`	`"System.Text.Encoding.Extensions": "4.0.11-beta-*"`
`19`	`20`	`}`
`20`	`21`	`}`