Skip to content
This repository was archived by the owner on Nov 20, 2018. It is now read-only.

Commit 43d0b0f

Browse files
Update WebEncoders from Unicode 7.0 to Unicode 8.0
Add "how to update" file detailing update steps
1 parent f3e8288 commit 43d0b0f

File tree

10 files changed: 2,189 additions and 123 deletions.

src/Microsoft.Framework.WebEncoders.Core/UnicodeHelpers.cs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,14 @@ internal unsafe static class UnicodeHelpers
2828

2929
/// <summary>
3030
/// Helper method which creates a bitmap of all characters which are
31-
/// defined per version 7.0.0 of the Unicode specification.
31+
/// defined per version 8.0 of the Unicode specification.
3232
/// </summary>
3333
[MethodImpl(MethodImplOptions.NoInlining)]
3434
private static uint[] CreateDefinedCharacterBitmap()
3535
{
3636
// The stream should be exactly 8KB in size.
3737
var assembly = typeof(UnicodeHelpers).GetTypeInfo().Assembly;
38-
var resourceName = assembly.GetName().Name + ".compiler.resources.unicode-7.0.0-defined-characters.bin";
38+
var resourceName = assembly.GetName().Name + ".compiler.resources.unicode-defined-chars.bin";
3939

4040
var stream = assembly.GetManifestResourceStream(resourceName);
4141
if (stream.Length != 8 * 1024)
@@ -72,7 +72,7 @@ private static uint[] CreateDefinedCharacterBitmap()
7272
}
7373

7474
/// <summary>
75-
/// Returns a bitmap of all characters which are defined per version 7.0.0
75+
/// Returns a bitmap of all characters which are defined per version 8.0
7676
/// of the Unicode specification.
7777
/// </summary>
7878
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -204,7 +204,7 @@ internal static int GetUtf8RepresentationForScalarValue(uint scalar)
204204
}
205205

206206
/// <summary>
207-
/// Returns a value stating whether a character is defined per version 7.0.0
207+
/// Returns a value stating whether a character is defined per version 8.0
208208
/// of the Unicode specification. Certain classes of characters (control chars,
209209
/// private use, surrogates, some whitespace) are considered "undefined" for
210210
/// our purposes.

src/Microsoft.Framework.WebEncoders.Core/UnicodeRanges.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ namespace Microsoft.Framework.WebEncoders
1010
{
1111
/// <summary>
1212
/// Contains predefined <see cref="UnicodeRange"/> instances which correspond to blocks
13-
/// from the Unicode 7.0 specification.
13+
/// from the Unicode 8.0 specification.
1414
/// </summary>
1515
public static partial class UnicodeRanges
1616
{

src/Microsoft.Framework.WebEncoders.Core/UnicodeRanges.generated.cs

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1277,6 +1277,15 @@ public static partial class UnicodeRanges
12771277
public static UnicodeRange LatinExtendedE => Volatile.Read(ref _latinExtendedE) ?? CreateRange(ref _latinExtendedE, first: '\uAB30', last: '\uAB6F');
12781278
private static UnicodeRange _latinExtendedE;
12791279

1280+
/// <summary>
1281+
/// A <see cref="UnicodeRange"/> corresponding to the 'Cherokee Supplement' Unicode block (U+AB70..U+ABBF).
1282+
/// </summary>
1283+
/// <remarks>
1284+
/// See http://www.unicode.org/charts/PDF/UAB70.pdf for the full set of characters in this block.
1285+
/// </remarks>
1286+
public static UnicodeRange CherokeeSupplement => Volatile.Read(ref _cherokeeSupplement) ?? CreateRange(ref _cherokeeSupplement, first: '\uAB70', last: '\uABBF');
1287+
private static UnicodeRange _cherokeeSupplement;
1288+
12801289
/// <summary>
12811290
/// A <see cref="UnicodeRange"/> corresponding to the 'Meetei Mayek' Unicode block (U+ABC0..U+ABFF).
12821291
/// </summary>
@@ -1303,7 +1312,7 @@ public static partial class UnicodeRanges
13031312
/// </remarks>
13041313
public static UnicodeRange HangulJamoExtendedB => Volatile.Read(ref _hangulJamoExtendedB) ?? CreateRange(ref _hangulJamoExtendedB, first: '\uD7B0', last: '\uD7FF');
13051314
private static UnicodeRange _hangulJamoExtendedB;
1306-
1315+
13071316
/// <summary>
13081317
/// A <see cref="UnicodeRange"/> corresponding to the 'CJK Compatibility Ideographs' Unicode block (U+F900..U+FAFF).
13091318
/// </summary>

test/Microsoft.Framework.WebEncoders.Tests/UnicodeHelpersTests.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,15 +193,15 @@ private static bool[] ReadListOfDefinedCharacters()
193193
}
194194
}
195195

196-
// Handle known spans from Unicode 7.0.0's UnicodeData.txt
196+
// Handle known spans from Unicode 8.0's UnicodeData.txt
197197

198198
// CJK Ideograph Extension A
199199
for (int i = '\u3400'; i <= '\u4DB5'; i++)
200200
{
201201
retVal[i] = true;
202202
}
203203
// CJK Ideograph
204-
for (int i = '\u4E00'; i <= '\u9FCC'; i++)
204+
for (int i = '\u4E00'; i <= '\u9FD5'; i++)
205205
{
206206
retVal[i] = true;
207207
}

test/Microsoft.Framework.WebEncoders.Tests/UnicodeRangesTests.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,7 @@ public void Range_All()
172172
[InlineData('\uAAE0', '\uAAFF', nameof(UnicodeRanges.MeeteiMayekExtensions))]
173173
[InlineData('\uAB00', '\uAB2F', nameof(UnicodeRanges.EthiopicExtendedA))]
174174
[InlineData('\uAB30', '\uAB6F', nameof(UnicodeRanges.LatinExtendedE))]
175+
[InlineData('\uAB70', '\uABBF', nameof(UnicodeRanges.CherokeeSupplement))]
175176
[InlineData('\uABC0', '\uABFF', nameof(UnicodeRanges.MeeteiMayek))]
176177
[InlineData('\uAC00', '\uD7AF', nameof(UnicodeRanges.HangulSyllables))]
177178
[InlineData('\uD7B0', '\uD7FF', nameof(UnicodeRanges.HangulJamoExtendedB))]

unicode/Blocks.txt

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,11 @@
1-
# Blocks-7.0.0.txt
2-
# Date: 2014-04-03, 23:23:00 GMT [RP, KW]
1+
# Blocks-8.0.0.txt
2+
# Date: 2014-11-10, 23:04:00 GMT [KW]
33
#
44
# Unicode Character Database
55
# Copyright (c) 1991-2014 Unicode, Inc.
66
# For terms of use, see http://www.unicode.org/terms_of_use.html
77
# For documentation, see http://www.unicode.org/reports/tr44/
88
#
9-
# Note: The casing of block names is not normative.
10-
# For example, "Basic Latin" and "BASIC LATIN" are equivalent.
11-
#
129
# Format:
1310
# Start Code..End Code; Block Name
1411

@@ -20,6 +17,14 @@
2017
# For more information on the comparison of property values,
2118
# see UAX #44: http://www.unicode.org/reports/tr44/
2219
#
20+
# All block ranges start with a value where (cp MOD 16) = 0,
21+
# and end with a value where (cp MOD 16) = 15. In other words,
22+
# the last hexadecimal digit of the start of range is ...0
23+
# and the last hexadecimal digit of the end of range is ...F.
24+
# This constraint on block ranges guarantees that allocations
25+
# are done in terms of whole columns, and that code chart display
26+
# never involves splitting columns in the charts.
27+
#
2328
# All code points not explicitly listed for Block
2429
# have the value No_Block.
2530

@@ -168,6 +173,7 @@ AA80..AADF; Tai Viet
168173
AAE0..AAFF; Meetei Mayek Extensions
169174
AB00..AB2F; Ethiopic Extended-A
170175
AB30..AB6F; Latin Extended-E
176+
AB70..ABBF; Cherokee Supplement
171177
ABC0..ABFF; Meetei Mayek
172178
AC00..D7AF; Hangul Syllables
173179
D7B0..D7FF; Hangul Jamo Extended-B
@@ -210,6 +216,7 @@ FFF0..FFFF; Specials
210216
10840..1085F; Imperial Aramaic
211217
10860..1087F; Palmyrene
212218
10880..108AF; Nabataean
219+
108E0..108FF; Hatran
213220
10900..1091F; Phoenician
214221
10920..1093F; Lydian
215222
10980..1099F; Meroitic Hieroglyphs
@@ -223,6 +230,7 @@ FFF0..FFFF; Specials
223230
10B60..10B7F; Inscriptional Pahlavi
224231
10B80..10BAF; Psalter Pahlavi
225232
10C00..10C4F; Old Turkic
233+
10C80..10CFF; Old Hungarian
226234
10E60..10E7F; Rumi Numeral Symbols
227235
11000..1107F; Brahmi
228236
11080..110CF; Kaithi
@@ -232,17 +240,21 @@ FFF0..FFFF; Specials
232240
11180..111DF; Sharada
233241
111E0..111FF; Sinhala Archaic Numbers
234242
11200..1124F; Khojki
243+
11280..112AF; Multani
235244
112B0..112FF; Khudawadi
236245
11300..1137F; Grantha
237246
11480..114DF; Tirhuta
238247
11580..115FF; Siddham
239248
11600..1165F; Modi
240249
11680..116CF; Takri
250+
11700..1173F; Ahom
241251
118A0..118FF; Warang Citi
242252
11AC0..11AFF; Pau Cin Hau
243253
12000..123FF; Cuneiform
244254
12400..1247F; Cuneiform Numbers and Punctuation
255+
12480..1254F; Early Dynastic Cuneiform
245256
13000..1342F; Egyptian Hieroglyphs
257+
14400..1467F; Anatolian Hieroglyphs
246258
16800..16A3F; Bamum Supplement
247259
16A40..16A6F; Mro
248260
16AD0..16AFF; Bassa Vah
@@ -257,6 +269,7 @@ FFF0..FFFF; Specials
257269
1D300..1D35F; Tai Xuan Jing Symbols
258270
1D360..1D37F; Counting Rod Numerals
259271
1D400..1D7FF; Mathematical Alphanumeric Symbols
272+
1D800..1DAAF; Sutton SignWriting
260273
1E800..1E8DF; Mende Kikakui
261274
1EE00..1EEFF; Arabic Mathematical Alphabetic Symbols
262275
1F000..1F02F; Mahjong Tiles
@@ -271,9 +284,11 @@ FFF0..FFFF; Specials
271284
1F700..1F77F; Alchemical Symbols
272285
1F780..1F7FF; Geometric Shapes Extended
273286
1F800..1F8FF; Supplemental Arrows-C
287+
1F900..1F9FF; Supplemental Symbols and Pictographs
274288
20000..2A6DF; CJK Unified Ideographs Extension B
275289
2A700..2B73F; CJK Unified Ideographs Extension C
276290
2B740..2B81F; CJK Unified Ideographs Extension D
291+
2B820..2CEAF; CJK Unified Ideographs Extension E
277292
2F800..2FA1F; CJK Compatibility Ideographs Supplement
278293
E0000..E007F; Tags
279294
E0100..E01EF; Variation Selectors Supplement

0 commit comments