Skip to content
This repository was archived by the owner on Nov 20, 2018. It is now read-only.

Commit 3607cd4

Browse files
committed
Add Url Path Decoder
1 parent 221df71 commit 3607cd4

File tree

4 files changed

+288
-0
lines changed

4 files changed

+288
-0
lines changed
Lines changed: 260 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
// Copyright (c) .NET Foundation. All rights reserved.
2+
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
3+
4+
namespace Microsoft.Extensions.WebEncoders
5+
{
6+
/// <summary>
/// Decodes percent-encoded UTF-8 sequences found in a URL path.
/// Sequences that are not well-formed UTF-8, and the encoded path separator (<c>%2F</c>),
/// are left in the output exactly as they appear in the input.
/// </summary>
public class PathDecoder
{
    /// <summary>
    /// Decodes the percent-encoded sequences of <paramref name="raw"/>.
    /// </summary>
    /// <param name="raw">The raw (still escaped) path.</param>
    /// <returns>
    /// The decoded path. The same string instance is returned when <paramref name="raw"/>
    /// contains no <c>%</c> character.
    /// </returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="raw"/> is null.</exception>
    public static string Decode(string raw)
    {
        if (raw == null)
        {
            throw new System.ArgumentNullException(nameof(raw));
        }

        if (NextPercentage(0, raw) == raw.Length)
        {
            // Nothing to unescape, avoid any allocation.
            return raw;
        }

        // Decoding never grows the text: an escape sequence is at least three characters long
        // and produces at most two UTF-16 code units, so raw.Length is a sufficient capacity.
        var buf = new char[raw.Length];

        int len;
        DecodeCore(raw, buf, out len);

        return new string(buf, 0, len);
    }

    /// <summary>
    /// Walks <paramref name="source"/> from left to right, copying literal runs verbatim and
    /// replacing each decodable escape sequence with the character(s) it stands for.
    /// </summary>
    /// <param name="source">The escaped text.</param>
    /// <param name="buffer">Destination buffer; must hold at least <c>source.Length</c> chars.</param>
    /// <param name="length">Receives the number of chars written to <paramref name="buffer"/>.</param>
    private static void DecodeCore(string source, char[] buffer, out int length)
    {
        var bufferPosition = 0;
        var sourcePosition = 0;

        while (sourcePosition < source.Length)
        {
            // Copy the literal run that precedes the next '%' (possibly empty).
            var next = NextPercentage(sourcePosition, source);
            Copy(source, ref sourcePosition, buffer, ref bufferPosition, next - sourcePosition);

            if (sourcePosition >= source.Length)
            {
                break;
            }

            int codePoint;
            var consumed = Unescape(source, next, out codePoint);
            if (consumed == 0)
            {
                // The sequence following the % can't be decoded as UTF-8:
                // keep the % literally and carry on with the character after it.
                Copy(source, ref sourcePosition, buffer, ref bufferPosition, 1);
            }
            else if (SkipUnescape(codePoint))
            {
                // Characters that must stay escaped (e.g. '/'): copy the original sequence.
                Copy(source, ref sourcePosition, buffer, ref bufferPosition, consumed);
            }
            else
            {
                if (codePoint > 0xFFFF)
                {
                    // Supplementary-plane code point: a single char can't hold it,
                    // so write it as a UTF-16 high/low surrogate pair.
                    var offset = codePoint - 0x10000;
                    buffer[bufferPosition++] = (char)(0xD800 + (offset >> 10));
                    buffer[bufferPosition++] = (char)(0xDC00 + (offset & 0x3FF));
                }
                else
                {
                    buffer[bufferPosition++] = (char)codePoint;
                }

                sourcePosition += consumed;
            }
        }

        length = bufferPosition;
    }

    /// <summary>
    /// Tells whether a decoded code point must be kept in its escaped form.
    /// Only the path separator '/' is preserved.
    /// </summary>
    private static bool SkipUnescape(int codePoint)
    {
        return codePoint == '/';
    }

    /// <summary>
    /// Unescapes one percent-encoded UTF-8 sequence.
    /// The sequence is a substring of <paramref name="source"/>; <paramref name="start"/> must
    /// point to the % character that begins it.
    /// If the sequence can be decoded as well-formed UTF-8:
    ///  - <paramref name="result"/> is set to the decoded Unicode code point (up to U+10FFFF);
    ///  - the length of the whole sequence, including every % character, is returned.
    /// Otherwise 0 is returned and <paramref name="result"/> is 0.
    /// </summary>
    private static int Unescape(string source, int start, out int result)
    {
        result = 0;

        if (start + 2 >= source.Length)
        {
            // Not enough characters left for "%XX".
            return 0;
        }

        int first;
        if (!TryGetUnescapedByte(source[start + 1], source[start + 2], out first))
        {
            return 0;
        }

        if ((first & 0x80) == 0)
        {
            // first <= 0x7F, single byte ASCII
            result = first;
            return 3;
        }

        // Anticipate the byte count from the lead byte.
        int currentDecodeBits;
        int expectBytes;
        int expectValueMin;
        if ((first & 0xE0) == 0xC0)
        {
            // 110x xxxx, expect 1 more byte
            currentDecodeBits = first & 0x1F;
            expectBytes = 1;
            expectValueMin = 0x80;
        }
        else if ((first & 0xF0) == 0xE0)
        {
            // 1110 xxxx, expect 2 more bytes
            currentDecodeBits = first & 0x0F;
            expectBytes = 2;
            expectValueMin = 0x800;
        }
        else if ((first & 0xF8) == 0xF0)
        {
            // 1111 0xxx, expect 3 more bytes
            currentDecodeBits = first & 0x07;
            expectBytes = 3;
            expectValueMin = 0x10000;
        }
        else
        {
            // A continuation byte or an invalid lead byte.
            return 0;
        }

        if (start + 2 + (expectBytes * 3) >= source.Length)
        {
            // Fewer characters left than the expected "%XX" groups need.
            return 0;
        }

        var position = start;
        var remainingBytes = expectBytes;
        while (remainingBytes > 0)
        {
            position += 3;
            if (source[position] != '%')
            {
                return 0;
            }

            int continuation;
            if (!TryGetUnescapedByte(source[position + 1], source[position + 2], out continuation))
            {
                return 0;
            }

            if ((continuation & 0xC0) != 0x80)
            {
                // Continuation bytes must look like 10xx xxxx.
                return 0;
            }

            // Append the lower 6 bits.
            currentDecodeBits = (currentDecodeBits << 6) | (continuation & 0x3F);

            remainingBytes--;

            if (remainingBytes == 1 && currentDecodeBits >= 0x360 && currentDecodeBits <= 0x37F)
            {
                // This is going to end up in the range 0xD800-0xDFFF: UTF-16 surrogates are not allowed in UTF-8.
                return 0;
            }

            if (remainingBytes == 2 && currentDecodeBits >= 0x110)
            {
                // This is going to be above the upper Unicode bound 0x10FFFF.
                return 0;
            }
        }

        if (currentDecodeBits < expectValueMin)
        {
            // Overlong encoding (e.g. using 2 bytes to encode something that only needed 1).
            return 0;
        }

        result = currentDecodeBits;
        return (expectBytes + 1) * 3;
    }

    /// <summary>
    /// Copies <paramref name="length"/> characters from the source string to the destination array and
    /// advances both positions past the copied characters.
    /// </summary>
    private static void Copy(string source, ref int sourceStart, char[] destination, ref int destinationStart, int length)
    {
        source.CopyTo(sourceStart, destination, destinationStart, length);
        sourceStart += length;
        destinationStart += length;
    }

    /// <summary>
    /// Finds the next % at or after <paramref name="start"/>. If no % is found, returns the source length.
    /// </summary>
    private static int NextPercentage(int start, string source)
    {
        var index = source.IndexOf('%', start);
        return index < 0 ? source.Length : index;
    }

    /// <summary>
    /// Combines two hexadecimal digits into the byte value they encode (0x00 - 0xFF).
    /// Returns false, with <paramref name="result"/> set to 0, when either character is not a hex digit.
    /// </summary>
    private static bool TryGetUnescapedByte(char first, char second, out int result)
    {
        if (!IsHex(first) || !IsHex(second))
        {
            result = 0;
            return false;
        }

        result = (HexToDec(first) << 4) + HexToDec(second);
        return true;
    }

    /// <summary>
    /// Tells whether the character is one of 0-9, A-F or a-f.
    /// </summary>
    private static bool IsHex(char value)
    {
        return (((value >= '0') && (value <= '9')) ||
                ((value >= 'A') && (value <= 'F')) ||
                ((value >= 'a') && (value <= 'f')));
    }

    /// <summary>
    /// Converts a hexadecimal digit to its numeric value. The caller must have validated
    /// the character with <see cref="IsHex(char)"/> first.
    /// </summary>
    private static int HexToDec(char value)
    {
        if (value <= '9')
        {
            return value - '0';
        }
        else if (value <= 'F')
        {
            return (value - 'A') + 10;
        }
        else // a - f
        {
            return (value - 'a') + 10;
        }
    }
}
260+
}

src/Microsoft.Extensions.WebEncoders.Core/project.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,13 @@
1515
"dnxcore50": {
1616
"dependencies": {
1717
"System.ComponentModel": "4.0.1-beta-*",
18+
"System.Collections": "4.0.11-beta-*",
1819
"System.Diagnostics.Debug": "4.0.11-beta-*",
1920
"System.IO": "4.0.11-beta-*",
2021
"System.Reflection": "4.0.10-*",
2122
"System.Resources.ResourceManager": "4.0.1-beta-*",
2223
"System.Runtime.Extensions": "4.0.11-beta-*",
24+
"System.Text.Encoding": "4.0.11-beta-*",
2325
"System.Threading": "4.0.11-beta-*"
2426
}
2527
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// Copyright (c) .NET Foundation. All rights reserved.
2+
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
3+
4+
using Xunit;
5+
6+
namespace Microsoft.Extensions.WebEncoders.Tests
7+
{
8+
/// <summary>
/// Tests for <see cref="PathDecoder.Decode(string)"/>.
/// </summary>
public class PathDecoderTests
{
    /// <summary>
    /// Verifies that decoding <paramref name="raw"/> yields <paramref name="expect"/>.
    /// </summary>
    [Theory]
    // Well-formed input
    [InlineData("", "")]
    [InlineData("/Path", "/Path")]
    [InlineData("/Path%20space", "/Path space")]
    [InlineData("%C3%84ra%20Benetton", "Ära Benetton")]
    [InlineData("%E6%88%91%E8%87%AA%E6%A8%AA%E5%88%80%E5%90%91%E5%A4%A9%E7%AC%91%E5%8E%BB%E7%95%99%E8%82%9D%E8%83%86%E4%B8%A4%E6%98%86%E4%BB%91", "我自横刀向天笑去留肝胆两昆仑")]
    // The encoded path separator stays escaped, in its original casing
    [InlineData("/Path%2Fspace", "/Path%2Fspace")]
    [InlineData("/Path%2fspace", "/Path%2fspace")]
    // Incomplete or non-hex escape sequences are preserved
    [InlineData("/Path/space%", "/Path/space%")]
    [InlineData("/Path/space%a", "/Path/space%a")]
    [InlineData("%", "%")]
    [InlineData("%%", "%%")]
    [InlineData("%zz", "%zz")]
    // Truncated multi-byte sequence
    [InlineData("%C3", "%C3")]
    // Lead byte followed by a non-continuation byte: only the valid ASCII escape is decoded
    [InlineData("%C3%28", "%C3(")]
    // Overlong encoding of '/'
    [InlineData("%C0%AF", "%C0%AF")]
    // Encoded UTF-16 surrogate (U+D800)
    [InlineData("%ED%A0%80", "%ED%A0%80")]
    // Beyond the upper Unicode bound U+10FFFF
    [InlineData("%F4%90%80%80", "%F4%90%80%80")]
    public void DecodeUri(string raw, string expect)
    {
        Assert.Equal(expect, PathDecoder.Decode(raw));
    }
}
25+
}

test/Microsoft.Extensions.WebEncoders.Tests/project.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
"dnx451": { },
1616
"dnxcore50": {
1717
"dependencies": {
18+
"System.Text.Encoding": "4.0.11-beta-*",
1819
"System.Text.Encoding.Extensions": "4.0.11-beta-*"
1920
}
2021
}

0 commit comments

Comments
 (0)