diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c620f51206..f1e002a7bc7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -480,6 +480,7 @@ add_library(${TARGET} ${GGML_SOURCES_OPENCL} whisper.h whisper.cpp + unicode.h ) include(DefaultTargetOptions) diff --git a/Makefile b/Makefile index 4a676f1ff6b..c3811eb6fbb 100644 --- a/Makefile +++ b/Makefile @@ -325,7 +325,7 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h WHISPER_OBJ += ggml.o ggml-alloc.o ggml-backend.o ggml-quants.o -whisper.o: whisper.cpp whisper.h ggml.h ggml-cuda.h +whisper.o: whisper.cpp whisper.h ggml.h ggml-cuda.h unicode.h $(CXX) $(CXXFLAGS) -c $< -o $@ ifndef WHISPER_COREML diff --git a/bindings/ruby/ext/unicode.h b/bindings/ruby/ext/unicode.h new file mode 100644 index 00000000000..bf7f105dfaf --- /dev/null +++ b/bindings/ruby/ext/unicode.h @@ -0,0 +1,742 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +static const std::vector> number_ranges = { + {0x30,0x39},{0xb2,0xb3},{0xb9,0xb9},{0xbc,0xbe},{0x660,0x669}, + {0x6f0,0x6f9},{0x7c0,0x7c9},{0x966,0x96f},{0x9e6,0x9ef},{0x9f4,0x9f9}, + {0xa66,0xa6f},{0xae6,0xaef},{0xb66,0xb6f},{0xb72,0xb77},{0xbe6,0xbf2}, + {0xc66,0xc6f},{0xc78,0xc7e},{0xce6,0xcef},{0xd58,0xd5e},{0xd66,0xd78}, + {0xde6,0xdef},{0xe50,0xe59},{0xed0,0xed9},{0xf20,0xf33},{0x1040,0x1049}, + {0x1090,0x1099},{0x1369,0x137c},{0x16ee,0x16f0},{0x17e0,0x17e9},{0x17f0,0x17f9}, + {0x1810,0x1819},{0x1946,0x194f},{0x19d0,0x19da},{0x1a80,0x1a89},{0x1a90,0x1a99}, + {0x1b50,0x1b59},{0x1bb0,0x1bb9},{0x1c40,0x1c49},{0x1c50,0x1c59},{0x2070,0x2070}, + {0x2074,0x2079},{0x2080,0x2089},{0x2150,0x2182},{0x2185,0x2189},{0x2460,0x249b}, + {0x24ea,0x24ff},{0x2776,0x2793},{0x2cfd,0x2cfd},{0x3007,0x3007},{0x3021,0x3029}, + {0x3038,0x303a},{0x3192,0x3195},{0x3220,0x3229},{0x3248,0x324f},{0x3251,0x325f}, + {0x3280,0x3289},{0x32b1,0x32bf},{0xa620,0xa629},{0xa6e6,0xa6ef},{0xa830,0xa835}, + 
{0xa8d0,0xa8d9},{0xa900,0xa909},{0xa9d0,0xa9d9},{0xa9f0,0xa9f9},{0xaa50,0xaa59}, + {0xabf0,0xabf9},{0xff10,0xff19},{0x10107,0x10133},{0x10140,0x10178},{0x1018a,0x1018b}, + {0x102e1,0x102fb},{0x10320,0x10323},{0x10341,0x10341},{0x1034a,0x1034a},{0x103d1,0x103d5}, + {0x104a0,0x104a9},{0x10858,0x1085f},{0x10879,0x1087f},{0x108a7,0x108af},{0x108fb,0x108ff}, + {0x10916,0x1091b},{0x109bc,0x109bd},{0x109c0,0x109cf},{0x109d2,0x109ff},{0x10a40,0x10a48}, + {0x10a7d,0x10a7e},{0x10a9d,0x10a9f},{0x10aeb,0x10aef},{0x10b58,0x10b5f},{0x10b78,0x10b7f}, + {0x10ba9,0x10baf},{0x10cfa,0x10cff},{0x10d30,0x10d39},{0x10e60,0x10e7e},{0x10f1d,0x10f26}, + {0x10f51,0x10f54},{0x10fc5,0x10fcb},{0x11052,0x1106f},{0x110f0,0x110f9},{0x11136,0x1113f}, + {0x111d0,0x111d9},{0x111e1,0x111f4},{0x112f0,0x112f9},{0x11450,0x11459},{0x114d0,0x114d9}, + {0x11650,0x11659},{0x116c0,0x116c9},{0x11730,0x1173b},{0x118e0,0x118f2},{0x11950,0x11959}, + {0x11c50,0x11c6c},{0x11d50,0x11d59},{0x11da0,0x11da9},{0x11fc0,0x11fd4},{0x12400,0x1246e}, + {0x16a60,0x16a69},{0x16ac0,0x16ac9},{0x16b50,0x16b59},{0x16b5b,0x16b61},{0x16e80,0x16e96}, + {0x1d2e0,0x1d2f3},{0x1d360,0x1d378},{0x1d7ce,0x1d7ff},{0x1e140,0x1e149},{0x1e2f0,0x1e2f9}, + {0x1e8c7,0x1e8cf},{0x1e950,0x1e959},{0x1ec71,0x1ecab},{0x1ecad,0x1ecaf},{0x1ecb1,0x1ecb4}, + {0x1ed01,0x1ed2d},{0x1ed2f,0x1ed3d},{0x1f100,0x1f10c},{0x1f100,0x1f10c} +}; + +static const std::vector> letter_ranges = { + {0x41,0x5a},{0x61,0x7a},{0xaa,0xaa},{0xb5,0xb5},{0xba,0xba}, + {0xc0,0xd6},{0xd8,0xf6},{0xf8,0x2c1},{0x2c6,0x2d1},{0x2e0,0x2e4}, + {0x2ec,0x2ec},{0x2ee,0x2ee},{0x370,0x374},{0x376,0x377},{0x37a,0x37d}, + {0x37f,0x37f},{0x386,0x386},{0x388,0x38a},{0x38c,0x38c},{0x38e,0x3a1}, + {0x3a3,0x3f5},{0x3f7,0x481},{0x48a,0x52f},{0x531,0x556},{0x559,0x559}, + {0x560,0x588},{0x5d0,0x5ea},{0x5ef,0x5f2},{0x620,0x64a},{0x66e,0x66f}, + {0x671,0x6d3},{0x6d5,0x6d5},{0x6e5,0x6e6},{0x6ee,0x6ef},{0x6fa,0x6fc}, + {0x6ff,0x6ff},{0x710,0x710},{0x712,0x72f},{0x74d,0x7a5},{0x7b1,0x7b1}, + 
{0x7ca,0x7ea},{0x7f4,0x7f5},{0x7fa,0x7fa},{0x800,0x815},{0x81a,0x81a}, + {0x824,0x824},{0x828,0x828},{0x840,0x858},{0x860,0x86a},{0x870,0x887}, + {0x889,0x88e},{0x8a0,0x8c9},{0x904,0x939},{0x93d,0x93d},{0x950,0x950}, + {0x958,0x961},{0x971,0x980},{0x985,0x98c},{0x98f,0x990},{0x993,0x9a8}, + {0x9aa,0x9b0},{0x9b2,0x9b2},{0x9b6,0x9b9},{0x9bd,0x9bd},{0x9ce,0x9ce}, + {0x9dc,0x9dd},{0x9df,0x9e1},{0x9f0,0x9f1},{0x9fc,0x9fc},{0xa05,0xa0a}, + {0xa0f,0xa10},{0xa13,0xa28},{0xa2a,0xa30},{0xa32,0xa33},{0xa35,0xa36}, + {0xa38,0xa39},{0xa59,0xa5c},{0xa5e,0xa5e},{0xa72,0xa74},{0xa85,0xa8d}, + {0xa8f,0xa91},{0xa93,0xaa8},{0xaaa,0xab0},{0xab2,0xab3},{0xab5,0xab9}, + {0xabd,0xabd},{0xad0,0xad0},{0xae0,0xae1},{0xaf9,0xaf9},{0xb05,0xb0c}, + {0xb0f,0xb10},{0xb13,0xb28},{0xb2a,0xb30},{0xb32,0xb33},{0xb35,0xb39}, + {0xb3d,0xb3d},{0xb5c,0xb5d},{0xb5f,0xb61},{0xb71,0xb71},{0xb83,0xb83}, + {0xb85,0xb8a},{0xb8e,0xb90},{0xb92,0xb95},{0xb99,0xb9a},{0xb9c,0xb9c}, + {0xb9e,0xb9f},{0xba3,0xba4},{0xba8,0xbaa},{0xbae,0xbb9},{0xbd0,0xbd0}, + {0xc05,0xc0c},{0xc0e,0xc10},{0xc12,0xc28},{0xc2a,0xc39},{0xc3d,0xc3d}, + {0xc58,0xc5a},{0xc5d,0xc5d},{0xc60,0xc61},{0xc80,0xc80},{0xc85,0xc8c}, + {0xc8e,0xc90},{0xc92,0xca8},{0xcaa,0xcb3},{0xcb5,0xcb9},{0xcbd,0xcbd}, + {0xcdd,0xcde},{0xce0,0xce1},{0xcf1,0xcf2},{0xd04,0xd0c},{0xd0e,0xd10}, + {0xd12,0xd3a},{0xd3d,0xd3d},{0xd4e,0xd4e},{0xd54,0xd56},{0xd5f,0xd61}, + {0xd7a,0xd7f},{0xd85,0xd96},{0xd9a,0xdb1},{0xdb3,0xdbb},{0xdbd,0xdbd}, + {0xdc0,0xdc6},{0xe01,0xe30},{0xe32,0xe33},{0xe40,0xe46},{0xe81,0xe82}, + {0xe84,0xe84},{0xe86,0xe8a},{0xe8c,0xea3},{0xea5,0xea5},{0xea7,0xeb0}, + {0xeb2,0xeb3},{0xebd,0xebd},{0xec0,0xec4},{0xec6,0xec6},{0xedc,0xedf}, + {0xf00,0xf00},{0xf40,0xf47},{0xf49,0xf6c},{0xf88,0xf8c},{0x1000,0x102a}, + {0x103f,0x103f},{0x1050,0x1055},{0x105a,0x105d},{0x1061,0x1061},{0x1065,0x1066}, + {0x106e,0x1070},{0x1075,0x1081},{0x108e,0x108e},{0x10a0,0x10c5},{0x10c7,0x10c7}, + 
{0x10cd,0x10cd},{0x10d0,0x10fa},{0x10fc,0x1248},{0x124a,0x124d},{0x1250,0x1256}, + {0x1258,0x1258},{0x125a,0x125d},{0x1260,0x1288},{0x128a,0x128d},{0x1290,0x12b0}, + {0x12b2,0x12b5},{0x12b8,0x12be},{0x12c0,0x12c0},{0x12c2,0x12c5},{0x12c8,0x12d6}, + {0x12d8,0x1310},{0x1312,0x1315},{0x1318,0x135a},{0x1380,0x138f},{0x13a0,0x13f5}, + {0x13f8,0x13fd},{0x1401,0x166c},{0x166f,0x167f},{0x1681,0x169a},{0x16a0,0x16ea}, + {0x16f1,0x16f8},{0x1700,0x1711},{0x171f,0x1731},{0x1740,0x1751},{0x1760,0x176c}, + {0x176e,0x1770},{0x1780,0x17b3},{0x17d7,0x17d7},{0x17dc,0x17dc},{0x1820,0x1878}, + {0x1880,0x1884},{0x1887,0x18a8},{0x18aa,0x18aa},{0x18b0,0x18f5},{0x1900,0x191e}, + {0x1950,0x196d},{0x1970,0x1974},{0x1980,0x19ab},{0x19b0,0x19c9},{0x1a00,0x1a16}, + {0x1a20,0x1a54},{0x1aa7,0x1aa7},{0x1b05,0x1b33},{0x1b45,0x1b4c},{0x1b83,0x1ba0}, + {0x1bae,0x1baf},{0x1bba,0x1be5},{0x1c00,0x1c23},{0x1c4d,0x1c4f},{0x1c5a,0x1c7d}, + {0x1c80,0x1c88},{0x1c90,0x1cba},{0x1cbd,0x1cbf},{0x1ce9,0x1cec},{0x1cee,0x1cf3}, + {0x1cf5,0x1cf6},{0x1cfa,0x1cfa},{0x1d00,0x1dbf},{0x1e00,0x1f15},{0x1f18,0x1f1d}, + {0x1f20,0x1f45},{0x1f48,0x1f4d},{0x1f50,0x1f57},{0x1f59,0x1f59},{0x1f5b,0x1f5b}, + {0x1f5d,0x1f5d},{0x1f5f,0x1f7d},{0x1f80,0x1fb4},{0x1fb6,0x1fbc},{0x1fbe,0x1fbe}, + {0x1fc2,0x1fc4},{0x1fc6,0x1fcc},{0x1fd0,0x1fd3},{0x1fd6,0x1fdb},{0x1fe0,0x1fec}, + {0x1ff2,0x1ff4},{0x1ff6,0x1ffc},{0x2071,0x2071},{0x207f,0x207f},{0x2090,0x209c}, + {0x2102,0x2102},{0x2107,0x2107},{0x210a,0x2113},{0x2115,0x2115},{0x2119,0x211d}, + {0x2124,0x2124},{0x2126,0x2126},{0x2128,0x2128},{0x212a,0x212d},{0x212f,0x2139}, + {0x213c,0x213f},{0x2145,0x2149},{0x214e,0x214e},{0x2183,0x2184},{0x2c00,0x2ce4}, + {0x2ceb,0x2cee},{0x2cf2,0x2cf3},{0x2d00,0x2d25},{0x2d27,0x2d27},{0x2d2d,0x2d2d}, + {0x2d30,0x2d67},{0x2d6f,0x2d6f},{0x2d80,0x2d96},{0x2da0,0x2da6},{0x2da8,0x2dae}, + {0x2db0,0x2db6},{0x2db8,0x2dbe},{0x2dc0,0x2dc6},{0x2dc8,0x2dce},{0x2dd0,0x2dd6}, + {0x2dd8,0x2dde},{0x2e2f,0x2e2f},{0x3005,0x3006},{0x3031,0x3035},{0x303b,0x303c}, + 
{0x3041,0x3096},{0x309d,0x309f},{0x30a1,0x30fa},{0x30fc,0x30ff},{0x3105,0x312f}, + {0x3131,0x318e},{0x31a0,0x31bf},{0x31f0,0x31ff},{0x3400,0x4dbf},{0x4e00,0xa48c}, + {0xa4d0,0xa4fd},{0xa500,0xa60c},{0xa610,0xa61f},{0xa62a,0xa62b},{0xa640,0xa66e}, + {0xa67f,0xa69d},{0xa6a0,0xa6e5},{0xa717,0xa71f},{0xa722,0xa788},{0xa78b,0xa7ca}, + {0xa7d0,0xa7d1},{0xa7d3,0xa7d3},{0xa7d5,0xa7d9},{0xa7f2,0xa801},{0xa803,0xa805}, + {0xa807,0xa80a},{0xa80c,0xa822},{0xa840,0xa873},{0xa882,0xa8b3},{0xa8f2,0xa8f7}, + {0xa8fb,0xa8fb},{0xa8fd,0xa8fe},{0xa90a,0xa925},{0xa930,0xa946},{0xa960,0xa97c}, + {0xa984,0xa9b2},{0xa9cf,0xa9cf},{0xa9e0,0xa9e4},{0xa9e6,0xa9ef},{0xa9fa,0xa9fe}, + {0xaa00,0xaa28},{0xaa40,0xaa42},{0xaa44,0xaa4b},{0xaa60,0xaa76},{0xaa7a,0xaa7a}, + {0xaa7e,0xaaaf},{0xaab1,0xaab1},{0xaab5,0xaab6},{0xaab9,0xaabd},{0xaac0,0xaac0}, + {0xaac2,0xaac2},{0xaadb,0xaadd},{0xaae0,0xaaea},{0xaaf2,0xaaf4},{0xab01,0xab06}, + {0xab09,0xab0e},{0xab11,0xab16},{0xab20,0xab26},{0xab28,0xab2e},{0xab30,0xab5a}, + {0xab5c,0xab69},{0xab70,0xabe2},{0xac00,0xd7a3},{0xd7b0,0xd7c6},{0xd7cb,0xd7fb}, + {0xf900,0xfa6d},{0xfa70,0xfad9},{0xfb00,0xfb06},{0xfb13,0xfb17},{0xfb1d,0xfb1d}, + {0xfb1f,0xfb28},{0xfb2a,0xfb36},{0xfb38,0xfb3c},{0xfb3e,0xfb3e},{0xfb40,0xfb41}, + {0xfb43,0xfb44},{0xfb46,0xfbb1},{0xfbd3,0xfd3d},{0xfd50,0xfd8f},{0xfd92,0xfdc7}, + {0xfdf0,0xfdfb},{0xfe70,0xfe74},{0xfe76,0xfefc},{0xff21,0xff3a},{0xff41,0xff5a}, + {0xff66,0xffbe},{0xffc2,0xffc7},{0xffca,0xffcf},{0xffd2,0xffd7},{0xffda,0xffdc}, + {0x10000,0x1000b},{0x1000d,0x10026},{0x10028,0x1003a},{0x1003c,0x1003d},{0x1003f,0x1004d}, + {0x10050,0x1005d},{0x10080,0x100fa},{0x10280,0x1029c},{0x102a0,0x102d0},{0x10300,0x1031f}, + {0x1032d,0x10340},{0x10342,0x10349},{0x10350,0x10375},{0x10380,0x1039d},{0x103a0,0x103c3}, + {0x103c8,0x103cf},{0x10400,0x1049d},{0x104b0,0x104d3},{0x104d8,0x104fb},{0x10500,0x10527}, + {0x10530,0x10563},{0x10570,0x1057a},{0x1057c,0x1058a},{0x1058c,0x10592},{0x10594,0x10595}, + 
{0x10597,0x105a1},{0x105a3,0x105b1},{0x105b3,0x105b9},{0x105bb,0x105bc},{0x10600,0x10736}, + {0x10740,0x10755},{0x10760,0x10767},{0x10780,0x10785},{0x10787,0x107b0},{0x107b2,0x107ba}, + {0x10800,0x10805},{0x10808,0x10808},{0x1080a,0x10835},{0x10837,0x10838},{0x1083c,0x1083c}, + {0x1083f,0x10855},{0x10860,0x10876},{0x10880,0x1089e},{0x108e0,0x108f2},{0x108f4,0x108f5}, + {0x10900,0x10915},{0x10920,0x10939},{0x10980,0x109b7},{0x109be,0x109bf},{0x10a00,0x10a00}, + {0x10a10,0x10a13},{0x10a15,0x10a17},{0x10a19,0x10a35},{0x10a60,0x10a7c},{0x10a80,0x10a9c}, + {0x10ac0,0x10ac7},{0x10ac9,0x10ae4},{0x10b00,0x10b35},{0x10b40,0x10b55},{0x10b60,0x10b72}, + {0x10b80,0x10b91},{0x10c00,0x10c48},{0x10c80,0x10cb2},{0x10cc0,0x10cf2},{0x10d00,0x10d23}, + {0x10e80,0x10ea9},{0x10eb0,0x10eb1},{0x10f00,0x10f1c},{0x10f27,0x10f27},{0x10f30,0x10f45}, + {0x10f70,0x10f81},{0x10fb0,0x10fc4},{0x10fe0,0x10ff6},{0x11003,0x11037},{0x11071,0x11072}, + {0x11075,0x11075},{0x11083,0x110af},{0x110d0,0x110e8},{0x11103,0x11126},{0x11144,0x11144}, + {0x11147,0x11147},{0x11150,0x11172},{0x11176,0x11176},{0x11183,0x111b2},{0x111c1,0x111c4}, + {0x111da,0x111da},{0x111dc,0x111dc},{0x11200,0x11211},{0x11213,0x1122b},{0x11280,0x11286}, + {0x11288,0x11288},{0x1128a,0x1128d},{0x1128f,0x1129d},{0x1129f,0x112a8},{0x112b0,0x112de}, + {0x11305,0x1130c},{0x1130f,0x11310},{0x11313,0x11328},{0x1132a,0x11330},{0x11332,0x11333}, + {0x11335,0x11339},{0x1133d,0x1133d},{0x11350,0x11350},{0x1135d,0x11361},{0x11400,0x11434}, + {0x11447,0x1144a},{0x1145f,0x11461},{0x11480,0x114af},{0x114c4,0x114c5},{0x114c7,0x114c7}, + {0x11580,0x115ae},{0x115d8,0x115db},{0x11600,0x1162f},{0x11644,0x11644},{0x11680,0x116aa}, + {0x116b8,0x116b8},{0x11700,0x1171a},{0x11740,0x11746},{0x11800,0x1182b},{0x118a0,0x118df}, + {0x118ff,0x11906},{0x11909,0x11909},{0x1190c,0x11913},{0x11915,0x11916},{0x11918,0x1192f}, + {0x1193f,0x1193f},{0x11941,0x11941},{0x119a0,0x119a7},{0x119aa,0x119d0},{0x119e1,0x119e1}, + 
{0x119e3,0x119e3},{0x11a00,0x11a00},{0x11a0b,0x11a32},{0x11a3a,0x11a3a},{0x11a50,0x11a50}, + {0x11a5c,0x11a89},{0x11a9d,0x11a9d},{0x11ab0,0x11af8},{0x11c00,0x11c08},{0x11c0a,0x11c2e}, + {0x11c40,0x11c40},{0x11c72,0x11c8f},{0x11d00,0x11d06},{0x11d08,0x11d09},{0x11d0b,0x11d30}, + {0x11d46,0x11d46},{0x11d60,0x11d65},{0x11d67,0x11d68},{0x11d6a,0x11d89},{0x11d98,0x11d98}, + {0x11ee0,0x11ef2},{0x11fb0,0x11fb0},{0x12000,0x12399},{0x12480,0x12543},{0x12f90,0x12ff0}, + {0x13000,0x1342e},{0x14400,0x14646},{0x16800,0x16a38},{0x16a40,0x16a5e},{0x16a70,0x16abe}, + {0x16ad0,0x16aed},{0x16b00,0x16b2f},{0x16b40,0x16b43},{0x16b63,0x16b77},{0x16b7d,0x16b8f}, + {0x16e40,0x16e7f},{0x16f00,0x16f4a},{0x16f50,0x16f50},{0x16f93,0x16f9f},{0x16fe0,0x16fe1}, + {0x16fe3,0x16fe3},{0x17000,0x187f7},{0x18800,0x18cd5},{0x18d00,0x18d08},{0x1aff0,0x1aff3}, + {0x1aff5,0x1affb},{0x1affd,0x1affe},{0x1b000,0x1b122},{0x1b150,0x1b152},{0x1b164,0x1b167}, + {0x1b170,0x1b2fb},{0x1bc00,0x1bc6a},{0x1bc70,0x1bc7c},{0x1bc80,0x1bc88},{0x1bc90,0x1bc99}, + {0x1d400,0x1d454},{0x1d456,0x1d49c},{0x1d49e,0x1d49f},{0x1d4a2,0x1d4a2},{0x1d4a5,0x1d4a6}, + {0x1d4a9,0x1d4ac},{0x1d4ae,0x1d4b9},{0x1d4bb,0x1d4bb},{0x1d4bd,0x1d4c3},{0x1d4c5,0x1d505}, + {0x1d507,0x1d50a},{0x1d50d,0x1d514},{0x1d516,0x1d51c},{0x1d51e,0x1d539},{0x1d53b,0x1d53e}, + {0x1d540,0x1d544},{0x1d546,0x1d546},{0x1d54a,0x1d550},{0x1d552,0x1d6a5},{0x1d6a8,0x1d6c0}, + {0x1d6c2,0x1d6da},{0x1d6dc,0x1d6fa},{0x1d6fc,0x1d714},{0x1d716,0x1d734},{0x1d736,0x1d74e}, + {0x1d750,0x1d76e},{0x1d770,0x1d788},{0x1d78a,0x1d7a8},{0x1d7aa,0x1d7c2},{0x1d7c4,0x1d7cb}, + {0x1df00,0x1df1e},{0x1e100,0x1e12c},{0x1e137,0x1e13d},{0x1e14e,0x1e14e},{0x1e290,0x1e2ad}, + {0x1e2c0,0x1e2eb},{0x1e7e0,0x1e7e6},{0x1e7e8,0x1e7eb},{0x1e7ed,0x1e7ee},{0x1e7f0,0x1e7fe}, + {0x1e800,0x1e8c4},{0x1e900,0x1e943},{0x1e94b,0x1e94b},{0x1ee00,0x1ee03},{0x1ee05,0x1ee1f}, + {0x1ee21,0x1ee22},{0x1ee24,0x1ee24},{0x1ee27,0x1ee27},{0x1ee29,0x1ee32},{0x1ee34,0x1ee37}, + 
{0x1ee39,0x1ee39},{0x1ee3b,0x1ee3b},{0x1ee42,0x1ee42},{0x1ee47,0x1ee47},{0x1ee49,0x1ee49}, + {0x1ee4b,0x1ee4b},{0x1ee4d,0x1ee4f},{0x1ee51,0x1ee52},{0x1ee54,0x1ee54},{0x1ee57,0x1ee57}, + {0x1ee59,0x1ee59},{0x1ee5b,0x1ee5b},{0x1ee5d,0x1ee5d},{0x1ee5f,0x1ee5f},{0x1ee61,0x1ee62}, + {0x1ee64,0x1ee64},{0x1ee67,0x1ee6a},{0x1ee6c,0x1ee72},{0x1ee74,0x1ee77},{0x1ee79,0x1ee7c}, + {0x1ee7e,0x1ee7e},{0x1ee80,0x1ee89},{0x1ee8b,0x1ee9b},{0x1eea1,0x1eea3},{0x1eea5,0x1eea9}, + {0x1eeab,0x1eebb},{0x20000,0x2a6df},{0x2a700,0x2b738},{0x2b740,0x2b81d},{0x2b820,0x2cea1}, + {0x2ceb0,0x2ebe0},{0x2f800,0x2fa1d},{0x2f800,0x2fa1d} +}; + +static const std::vector> punctuation_ranges = { + {0x21,0x23},{0x25,0x2a},{0x2c,0x2f},{0x3a,0x3b},{0x3f,0x40}, + {0x5b,0x5d},{0x5f,0x5f},{0x7b,0x7b},{0x7d,0x7d},{0xa1,0xa1}, + {0xa7,0xa7},{0xab,0xab},{0xb6,0xb7},{0xbb,0xbb},{0xbf,0xbf}, + {0x37e,0x37e},{0x387,0x387},{0x55a,0x55f},{0x589,0x58a},{0x5be,0x5be}, + {0x5c0,0x5c0},{0x5c3,0x5c3},{0x5c6,0x5c6},{0x5f3,0x5f4},{0x609,0x60a}, + {0x60c,0x60d},{0x61b,0x61b},{0x61d,0x61f},{0x66a,0x66d},{0x6d4,0x6d4}, + {0x700,0x70d},{0x7f7,0x7f9},{0x830,0x83e},{0x85e,0x85e},{0x964,0x965}, + {0x970,0x970},{0x9fd,0x9fd},{0xa76,0xa76},{0xaf0,0xaf0},{0xc77,0xc77}, + {0xc84,0xc84},{0xdf4,0xdf4},{0xe4f,0xe4f},{0xe5a,0xe5b},{0xf04,0xf12}, + {0xf14,0xf14},{0xf3a,0xf3d},{0xf85,0xf85},{0xfd0,0xfd4},{0xfd9,0xfda}, + {0x104a,0x104f},{0x10fb,0x10fb},{0x1360,0x1368},{0x1400,0x1400},{0x166e,0x166e}, + {0x169b,0x169c},{0x16eb,0x16ed},{0x1735,0x1736},{0x17d4,0x17d6},{0x17d8,0x17da}, + {0x1800,0x180a},{0x1944,0x1945},{0x1a1e,0x1a1f},{0x1aa0,0x1aa6},{0x1aa8,0x1aad}, + {0x1b5a,0x1b60},{0x1b7d,0x1b7e},{0x1bfc,0x1bff},{0x1c3b,0x1c3f},{0x1c7e,0x1c7f}, + {0x1cc0,0x1cc7},{0x1cd3,0x1cd3},{0x2010,0x2027},{0x2030,0x2043},{0x2045,0x2051}, + {0x2053,0x205e},{0x207d,0x207e},{0x208d,0x208e},{0x2308,0x230b},{0x2329,0x232a}, + {0x2768,0x2775},{0x27c5,0x27c6},{0x27e6,0x27ef},{0x2983,0x2998},{0x29d8,0x29db}, + 
{0x29fc,0x29fd},{0x2cf9,0x2cfc},{0x2cfe,0x2cff},{0x2d70,0x2d70},{0x2e00,0x2e2e}, + {0x2e30,0x2e4f},{0x2e52,0x2e5d},{0x3001,0x3003},{0x3008,0x3011},{0x3014,0x301f}, + {0x3030,0x3030},{0x303d,0x303d},{0x30a0,0x30a0},{0x30fb,0x30fb},{0xa4fe,0xa4ff}, + {0xa60d,0xa60f},{0xa673,0xa673},{0xa67e,0xa67e},{0xa6f2,0xa6f7},{0xa874,0xa877}, + {0xa8ce,0xa8cf},{0xa8f8,0xa8fa},{0xa8fc,0xa8fc},{0xa92e,0xa92f},{0xa95f,0xa95f}, + {0xa9c1,0xa9cd},{0xa9de,0xa9df},{0xaa5c,0xaa5f},{0xaade,0xaadf},{0xaaf0,0xaaf1}, + {0xabeb,0xabeb},{0xfd3e,0xfd3f},{0xfe10,0xfe19},{0xfe30,0xfe52},{0xfe54,0xfe61}, + {0xfe63,0xfe63},{0xfe68,0xfe68},{0xfe6a,0xfe6b},{0xff01,0xff03},{0xff05,0xff0a}, + {0xff0c,0xff0f},{0xff1a,0xff1b},{0xff1f,0xff20},{0xff3b,0xff3d},{0xff3f,0xff3f}, + {0xff5b,0xff5b},{0xff5d,0xff5d},{0xff5f,0xff65},{0x10100,0x10102},{0x1039f,0x1039f}, + {0x103d0,0x103d0},{0x1056f,0x1056f},{0x10857,0x10857},{0x1091f,0x1091f},{0x1093f,0x1093f}, + {0x10a50,0x10a58},{0x10a7f,0x10a7f},{0x10af0,0x10af6},{0x10b39,0x10b3f},{0x10b99,0x10b9c}, + {0x10ead,0x10ead},{0x10f55,0x10f59},{0x10f86,0x10f89},{0x11047,0x1104d},{0x110bb,0x110bc}, + {0x110be,0x110c1},{0x11140,0x11143},{0x11174,0x11175},{0x111c5,0x111c8},{0x111cd,0x111cd}, + {0x111db,0x111db},{0x111dd,0x111df},{0x11238,0x1123d},{0x112a9,0x112a9},{0x1144b,0x1144f}, + {0x1145a,0x1145b},{0x1145d,0x1145d},{0x114c6,0x114c6},{0x115c1,0x115d7},{0x11641,0x11643}, + {0x11660,0x1166c},{0x116b9,0x116b9},{0x1173c,0x1173e},{0x1183b,0x1183b},{0x11944,0x11946}, + {0x119e2,0x119e2},{0x11a3f,0x11a46},{0x11a9a,0x11a9c},{0x11a9e,0x11aa2},{0x11c41,0x11c45}, + {0x11c70,0x11c71},{0x11ef7,0x11ef8},{0x11fff,0x11fff},{0x12470,0x12474},{0x12ff1,0x12ff2}, + {0x16a6e,0x16a6f},{0x16af5,0x16af5},{0x16b37,0x16b3b},{0x16b44,0x16b44},{0x16e97,0x16e9a}, + {0x16fe2,0x16fe2},{0x1bc9f,0x1bc9f},{0x1da87,0x1da8b},{0x1da87,0x1da8b} +}; + +static const std::vector> separator_ranges = { + {0x20,0x20},{0xa0,0xa0},{0x1680,0x1680},{0x2000,0x200a},{0x2028,0x2029}, + 
{0x202f,0x202f},{0x205f,0x205f},{0x205f,0x205f} +}; + +static const std::vector> mark_ranges = { + {0x300,0x36f},{0x483,0x489},{0x591,0x5bd},{0x5bf,0x5bf},{0x5c1,0x5c2}, + {0x5c4,0x5c5},{0x5c7,0x5c7},{0x610,0x61a},{0x64b,0x65f},{0x670,0x670}, + {0x6d6,0x6dc},{0x6df,0x6e4},{0x6e7,0x6e8},{0x6ea,0x6ed},{0x711,0x711}, + {0x730,0x74a},{0x7a6,0x7b0},{0x7eb,0x7f3},{0x7fd,0x7fd},{0x816,0x819}, + {0x81b,0x823},{0x825,0x827},{0x829,0x82d},{0x859,0x85b},{0x898,0x89f}, + {0x8ca,0x8e1},{0x8e3,0x903},{0x93a,0x93c},{0x93e,0x94f},{0x951,0x957}, + {0x962,0x963},{0x981,0x983},{0x9bc,0x9bc},{0x9be,0x9c4},{0x9c7,0x9c8}, + {0x9cb,0x9cd},{0x9d7,0x9d7},{0x9e2,0x9e3},{0x9fe,0x9fe},{0xa01,0xa03}, + {0xa3c,0xa3c},{0xa3e,0xa42},{0xa47,0xa48},{0xa4b,0xa4d},{0xa51,0xa51}, + {0xa70,0xa71},{0xa75,0xa75},{0xa81,0xa83},{0xabc,0xabc},{0xabe,0xac5}, + {0xac7,0xac9},{0xacb,0xacd},{0xae2,0xae3},{0xafa,0xaff},{0xb01,0xb03}, + {0xb3c,0xb3c},{0xb3e,0xb44},{0xb47,0xb48},{0xb4b,0xb4d},{0xb55,0xb57}, + {0xb62,0xb63},{0xb82,0xb82},{0xbbe,0xbc2},{0xbc6,0xbc8},{0xbca,0xbcd}, + {0xbd7,0xbd7},{0xc00,0xc04},{0xc3c,0xc3c},{0xc3e,0xc44},{0xc46,0xc48}, + {0xc4a,0xc4d},{0xc55,0xc56},{0xc62,0xc63},{0xc81,0xc83},{0xcbc,0xcbc}, + {0xcbe,0xcc4},{0xcc6,0xcc8},{0xcca,0xccd},{0xcd5,0xcd6},{0xce2,0xce3}, + {0xd00,0xd03},{0xd3b,0xd3c},{0xd3e,0xd44},{0xd46,0xd48},{0xd4a,0xd4d}, + {0xd57,0xd57},{0xd62,0xd63},{0xd81,0xd83},{0xdca,0xdca},{0xdcf,0xdd4}, + {0xdd6,0xdd6},{0xdd8,0xddf},{0xdf2,0xdf3},{0xe31,0xe31},{0xe34,0xe3a}, + {0xe47,0xe4e},{0xeb1,0xeb1},{0xeb4,0xebc},{0xec8,0xecd},{0xf18,0xf19}, + {0xf35,0xf35},{0xf37,0xf37},{0xf39,0xf39},{0xf3e,0xf3f},{0xf71,0xf84}, + {0xf86,0xf87},{0xf8d,0xf97},{0xf99,0xfbc},{0xfc6,0xfc6},{0x102b,0x103e}, + {0x1056,0x1059},{0x105e,0x1060},{0x1062,0x1064},{0x1067,0x106d},{0x1071,0x1074}, + {0x1082,0x108d},{0x108f,0x108f},{0x109a,0x109d},{0x135d,0x135f},{0x1712,0x1715}, + {0x1732,0x1734},{0x1752,0x1753},{0x1772,0x1773},{0x17b4,0x17d3},{0x17dd,0x17dd}, + 
{0x180b,0x180d},{0x180f,0x180f},{0x1885,0x1886},{0x18a9,0x18a9},{0x1920,0x192b}, + {0x1930,0x193b},{0x1a17,0x1a1b},{0x1a55,0x1a5e},{0x1a60,0x1a7c},{0x1a7f,0x1a7f}, + {0x1ab0,0x1ace},{0x1b00,0x1b04},{0x1b34,0x1b44},{0x1b6b,0x1b73},{0x1b80,0x1b82}, + {0x1ba1,0x1bad},{0x1be6,0x1bf3},{0x1c24,0x1c37},{0x1cd0,0x1cd2},{0x1cd4,0x1ce8}, + {0x1ced,0x1ced},{0x1cf4,0x1cf4},{0x1cf7,0x1cf9},{0x1dc0,0x1dff},{0x20d0,0x20f0}, + {0x2cef,0x2cf1},{0x2d7f,0x2d7f},{0x2de0,0x2dff},{0x302a,0x302f},{0x3099,0x309a}, + {0xa66f,0xa672},{0xa674,0xa67d},{0xa69e,0xa69f},{0xa6f0,0xa6f1},{0xa802,0xa802}, + {0xa806,0xa806},{0xa80b,0xa80b},{0xa823,0xa827},{0xa82c,0xa82c},{0xa880,0xa881}, + {0xa8b4,0xa8c5},{0xa8e0,0xa8f1},{0xa8ff,0xa8ff},{0xa926,0xa92d},{0xa947,0xa953}, + {0xa980,0xa983},{0xa9b3,0xa9c0},{0xa9e5,0xa9e5},{0xaa29,0xaa36},{0xaa43,0xaa43}, + {0xaa4c,0xaa4d},{0xaa7b,0xaa7d},{0xaab0,0xaab0},{0xaab2,0xaab4},{0xaab7,0xaab8}, + {0xaabe,0xaabf},{0xaac1,0xaac1},{0xaaeb,0xaaef},{0xaaf5,0xaaf6},{0xabe3,0xabea}, + {0xabec,0xabed},{0xfb1e,0xfb1e},{0xfe00,0xfe0f},{0xfe20,0xfe2f},{0x101fd,0x101fd}, + {0x102e0,0x102e0},{0x10376,0x1037a},{0x10a01,0x10a03},{0x10a05,0x10a06},{0x10a0c,0x10a0f}, + {0x10a38,0x10a3a},{0x10a3f,0x10a3f},{0x10ae5,0x10ae6},{0x10d24,0x10d27},{0x10eab,0x10eac}, + {0x10f46,0x10f50},{0x10f82,0x10f85},{0x11000,0x11002},{0x11038,0x11046},{0x11070,0x11070}, + {0x11073,0x11074},{0x1107f,0x11082},{0x110b0,0x110ba},{0x110c2,0x110c2},{0x11100,0x11102}, + {0x11127,0x11134},{0x11145,0x11146},{0x11173,0x11173},{0x11180,0x11182},{0x111b3,0x111c0}, + {0x111c9,0x111cc},{0x111ce,0x111cf},{0x1122c,0x11237},{0x1123e,0x1123e},{0x112df,0x112ea}, + {0x11300,0x11303},{0x1133b,0x1133c},{0x1133e,0x11344},{0x11347,0x11348},{0x1134b,0x1134d}, + {0x11357,0x11357},{0x11362,0x11363},{0x11366,0x1136c},{0x11370,0x11374},{0x11435,0x11446}, + {0x1145e,0x1145e},{0x114b0,0x114c3},{0x115af,0x115b5},{0x115b8,0x115c0},{0x115dc,0x115dd}, + 
{0x11630,0x11640},{0x116ab,0x116b7},{0x1171d,0x1172b},{0x1182c,0x1183a},{0x11930,0x11935}, + {0x11937,0x11938},{0x1193b,0x1193e},{0x11940,0x11940},{0x11942,0x11943},{0x119d1,0x119d7}, + {0x119da,0x119e0},{0x119e4,0x119e4},{0x11a01,0x11a0a},{0x11a33,0x11a39},{0x11a3b,0x11a3e}, + {0x11a47,0x11a47},{0x11a51,0x11a5b},{0x11a8a,0x11a99},{0x11c2f,0x11c36},{0x11c38,0x11c3f}, + {0x11c92,0x11ca7},{0x11ca9,0x11cb6},{0x11d31,0x11d36},{0x11d3a,0x11d3a},{0x11d3c,0x11d3d}, + {0x11d3f,0x11d45},{0x11d47,0x11d47},{0x11d8a,0x11d8e},{0x11d90,0x11d91},{0x11d93,0x11d97}, + {0x11ef3,0x11ef6},{0x16af0,0x16af4},{0x16b30,0x16b36},{0x16f4f,0x16f4f},{0x16f51,0x16f87}, + {0x16f8f,0x16f92},{0x16fe4,0x16fe4},{0x16ff0,0x16ff1},{0x1bc9d,0x1bc9e},{0x1cf00,0x1cf2d}, + {0x1cf30,0x1cf46},{0x1d165,0x1d169},{0x1d16d,0x1d172},{0x1d17b,0x1d182},{0x1d185,0x1d18b}, + {0x1d1aa,0x1d1ad},{0x1d242,0x1d244},{0x1da00,0x1da36},{0x1da3b,0x1da6c},{0x1da75,0x1da75}, + {0x1da84,0x1da84},{0x1da9b,0x1da9f},{0x1daa1,0x1daaf},{0x1e000,0x1e006},{0x1e008,0x1e018}, + {0x1e01b,0x1e021},{0x1e023,0x1e024},{0x1e026,0x1e02a},{0x1e130,0x1e136},{0x1e2ae,0x1e2ae}, + {0x1e2ec,0x1e2ef},{0x1e8d0,0x1e8d6},{0x1e944,0x1e94a},{0x1e944,0x1e94a} +}; + +static const std::vector> symbol_ranges = { + {0x24,0x24},{0x2b,0x2b},{0x3c,0x3e},{0x5e,0x5e},{0x60,0x60}, + {0x7c,0x7c},{0x7e,0x7e},{0xa2,0xa6},{0xa8,0xa9},{0xac,0xac}, + {0xae,0xb1},{0xb4,0xb4},{0xb8,0xb8},{0xd7,0xd7},{0xf7,0xf7}, + {0x2c2,0x2c5},{0x2d2,0x2df},{0x2e5,0x2eb},{0x2ed,0x2ed},{0x2ef,0x2ff}, + {0x375,0x375},{0x384,0x385},{0x3f6,0x3f6},{0x482,0x482},{0x58d,0x58f}, + {0x606,0x608},{0x60b,0x60b},{0x60e,0x60f},{0x6de,0x6de},{0x6e9,0x6e9}, + {0x6fd,0x6fe},{0x7f6,0x7f6},{0x7fe,0x7ff},{0x888,0x888},{0x9f2,0x9f3}, + {0x9fa,0x9fb},{0xaf1,0xaf1},{0xb70,0xb70},{0xbf3,0xbfa},{0xc7f,0xc7f}, + {0xd4f,0xd4f},{0xd79,0xd79},{0xe3f,0xe3f},{0xf01,0xf03},{0xf13,0xf13}, + {0xf15,0xf17},{0xf1a,0xf1f},{0xf34,0xf34},{0xf36,0xf36},{0xf38,0xf38}, + 
{0xfbe,0xfc5},{0xfc7,0xfcc},{0xfce,0xfcf},{0xfd5,0xfd8},{0x109e,0x109f}, + {0x1390,0x1399},{0x166d,0x166d},{0x17db,0x17db},{0x1940,0x1940},{0x19de,0x19ff}, + {0x1b61,0x1b6a},{0x1b74,0x1b7c},{0x1fbd,0x1fbd},{0x1fbf,0x1fc1},{0x1fcd,0x1fcf}, + {0x1fdd,0x1fdf},{0x1fed,0x1fef},{0x1ffd,0x1ffe},{0x2044,0x2044},{0x2052,0x2052}, + {0x207a,0x207c},{0x208a,0x208c},{0x20a0,0x20c0},{0x2100,0x2101},{0x2103,0x2106}, + {0x2108,0x2109},{0x2114,0x2114},{0x2116,0x2118},{0x211e,0x2123},{0x2125,0x2125}, + {0x2127,0x2127},{0x2129,0x2129},{0x212e,0x212e},{0x213a,0x213b},{0x2140,0x2144}, + {0x214a,0x214d},{0x214f,0x214f},{0x218a,0x218b},{0x2190,0x2307},{0x230c,0x2328}, + {0x232b,0x2426},{0x2440,0x244a},{0x249c,0x24e9},{0x2500,0x2767},{0x2794,0x27c4}, + {0x27c7,0x27e5},{0x27f0,0x2982},{0x2999,0x29d7},{0x29dc,0x29fb},{0x29fe,0x2b73}, + {0x2b76,0x2b95},{0x2b97,0x2bff},{0x2ce5,0x2cea},{0x2e50,0x2e51},{0x2e80,0x2e99}, + {0x2e9b,0x2ef3},{0x2f00,0x2fd5},{0x2ff0,0x2ffb},{0x3004,0x3004},{0x3012,0x3013}, + {0x3020,0x3020},{0x3036,0x3037},{0x303e,0x303f},{0x309b,0x309c},{0x3190,0x3191}, + {0x3196,0x319f},{0x31c0,0x31e3},{0x3200,0x321e},{0x322a,0x3247},{0x3250,0x3250}, + {0x3260,0x327f},{0x328a,0x32b0},{0x32c0,0x33ff},{0x4dc0,0x4dff},{0xa490,0xa4c6}, + {0xa700,0xa716},{0xa720,0xa721},{0xa789,0xa78a},{0xa828,0xa82b},{0xa836,0xa839}, + {0xaa77,0xaa79},{0xab5b,0xab5b},{0xab6a,0xab6b},{0xfb29,0xfb29},{0xfbb2,0xfbc2}, + {0xfd40,0xfd4f},{0xfdcf,0xfdcf},{0xfdfc,0xfdff},{0xfe62,0xfe62},{0xfe64,0xfe66}, + {0xfe69,0xfe69},{0xff04,0xff04},{0xff0b,0xff0b},{0xff1c,0xff1e},{0xff3e,0xff3e}, + {0xff40,0xff40},{0xff5c,0xff5c},{0xff5e,0xff5e},{0xffe0,0xffe6},{0xffe8,0xffee}, + {0xfffc,0xfffd},{0x10137,0x1013f},{0x10179,0x10189},{0x1018c,0x1018e},{0x10190,0x1019c}, + {0x101a0,0x101a0},{0x101d0,0x101fc},{0x10877,0x10878},{0x10ac8,0x10ac8},{0x1173f,0x1173f}, + {0x11fd5,0x11ff1},{0x16b3c,0x16b3f},{0x16b45,0x16b45},{0x1bc9c,0x1bc9c},{0x1cf50,0x1cfc3}, + 
{0x1d000,0x1d0f5},{0x1d100,0x1d126},{0x1d129,0x1d164},{0x1d16a,0x1d16c},{0x1d183,0x1d184}, + {0x1d18c,0x1d1a9},{0x1d1ae,0x1d1ea},{0x1d200,0x1d241},{0x1d245,0x1d245},{0x1d300,0x1d356}, + {0x1d6c1,0x1d6c1},{0x1d6db,0x1d6db},{0x1d6fb,0x1d6fb},{0x1d715,0x1d715},{0x1d735,0x1d735}, + {0x1d74f,0x1d74f},{0x1d76f,0x1d76f},{0x1d789,0x1d789},{0x1d7a9,0x1d7a9},{0x1d7c3,0x1d7c3}, + {0x1d800,0x1d9ff},{0x1da37,0x1da3a},{0x1da6d,0x1da74},{0x1da76,0x1da83},{0x1da85,0x1da86}, + {0x1e14f,0x1e14f},{0x1e2ff,0x1e2ff},{0x1ecac,0x1ecac},{0x1ecb0,0x1ecb0},{0x1ed2e,0x1ed2e}, + {0x1eef0,0x1eef1},{0x1f000,0x1f02b},{0x1f030,0x1f093},{0x1f0a0,0x1f0ae},{0x1f0b1,0x1f0bf}, + {0x1f0c1,0x1f0cf},{0x1f0d1,0x1f0f5},{0x1f10d,0x1f1ad},{0x1f1e6,0x1f202},{0x1f210,0x1f23b}, + {0x1f240,0x1f248},{0x1f250,0x1f251},{0x1f260,0x1f265},{0x1f300,0x1f6d7},{0x1f6dd,0x1f6ec}, + {0x1f6f0,0x1f6fc},{0x1f700,0x1f773},{0x1f780,0x1f7d8},{0x1f7e0,0x1f7eb},{0x1f7f0,0x1f7f0}, + {0x1f800,0x1f80b},{0x1f810,0x1f847},{0x1f850,0x1f859},{0x1f860,0x1f887},{0x1f890,0x1f8ad}, + {0x1f8b0,0x1f8b1},{0x1f900,0x1fa53},{0x1fa60,0x1fa6d},{0x1fa70,0x1fa74},{0x1fa78,0x1fa7c}, + {0x1fa80,0x1fa86},{0x1fa90,0x1faac},{0x1fab0,0x1faba},{0x1fac0,0x1fac5},{0x1fad0,0x1fad9}, + {0x1fae0,0x1fae7},{0x1faf0,0x1faf6},{0x1fb00,0x1fb92},{0x1fb00,0x1fb92} +}; + +static const std::vector> other_ranges = { + {0x0,0x1f},{0x7f,0x9f},{0xad,0xad},{0x378,0x379},{0x380,0x383}, + {0x38b,0x38b},{0x38d,0x38d},{0x3a2,0x3a2},{0x530,0x530},{0x557,0x558}, + {0x58b,0x58c},{0x590,0x590},{0x5c8,0x5cf},{0x5eb,0x5ee},{0x5f5,0x605}, + {0x61c,0x61c},{0x6dd,0x6dd},{0x70e,0x70f},{0x74b,0x74c},{0x7b2,0x7bf}, + {0x7fb,0x7fc},{0x82e,0x82f},{0x83f,0x83f},{0x85c,0x85d},{0x85f,0x85f}, + {0x86b,0x86f},{0x88f,0x897},{0x8e2,0x8e2},{0x984,0x984},{0x98d,0x98e}, + {0x991,0x992},{0x9a9,0x9a9},{0x9b1,0x9b1},{0x9b3,0x9b5},{0x9ba,0x9bb}, + {0x9c5,0x9c6},{0x9c9,0x9ca},{0x9cf,0x9d6},{0x9d8,0x9db},{0x9de,0x9de}, + {0x9e4,0x9e5},{0x9ff,0xa00},{0xa04,0xa04},{0xa0b,0xa0e},{0xa11,0xa12}, + 
{0xa29,0xa29},{0xa31,0xa31},{0xa34,0xa34},{0xa37,0xa37},{0xa3a,0xa3b}, + {0xa3d,0xa3d},{0xa43,0xa46},{0xa49,0xa4a},{0xa4e,0xa50},{0xa52,0xa58}, + {0xa5d,0xa5d},{0xa5f,0xa65},{0xa77,0xa80},{0xa84,0xa84},{0xa8e,0xa8e}, + {0xa92,0xa92},{0xaa9,0xaa9},{0xab1,0xab1},{0xab4,0xab4},{0xaba,0xabb}, + {0xac6,0xac6},{0xaca,0xaca},{0xace,0xacf},{0xad1,0xadf},{0xae4,0xae5}, + {0xaf2,0xaf8},{0xb00,0xb00},{0xb04,0xb04},{0xb0d,0xb0e},{0xb11,0xb12}, + {0xb29,0xb29},{0xb31,0xb31},{0xb34,0xb34},{0xb3a,0xb3b},{0xb45,0xb46}, + {0xb49,0xb4a},{0xb4e,0xb54},{0xb58,0xb5b},{0xb5e,0xb5e},{0xb64,0xb65}, + {0xb78,0xb81},{0xb84,0xb84},{0xb8b,0xb8d},{0xb91,0xb91},{0xb96,0xb98}, + {0xb9b,0xb9b},{0xb9d,0xb9d},{0xba0,0xba2},{0xba5,0xba7},{0xbab,0xbad}, + {0xbba,0xbbd},{0xbc3,0xbc5},{0xbc9,0xbc9},{0xbce,0xbcf},{0xbd1,0xbd6}, + {0xbd8,0xbe5},{0xbfb,0xbff},{0xc0d,0xc0d},{0xc11,0xc11},{0xc29,0xc29}, + {0xc3a,0xc3b},{0xc45,0xc45},{0xc49,0xc49},{0xc4e,0xc54},{0xc57,0xc57}, + {0xc5b,0xc5c},{0xc5e,0xc5f},{0xc64,0xc65},{0xc70,0xc76},{0xc8d,0xc8d}, + {0xc91,0xc91},{0xca9,0xca9},{0xcb4,0xcb4},{0xcba,0xcbb},{0xcc5,0xcc5}, + {0xcc9,0xcc9},{0xcce,0xcd4},{0xcd7,0xcdc},{0xcdf,0xcdf},{0xce4,0xce5}, + {0xcf0,0xcf0},{0xcf3,0xcff},{0xd0d,0xd0d},{0xd11,0xd11},{0xd45,0xd45}, + {0xd49,0xd49},{0xd50,0xd53},{0xd64,0xd65},{0xd80,0xd80},{0xd84,0xd84}, + {0xd97,0xd99},{0xdb2,0xdb2},{0xdbc,0xdbc},{0xdbe,0xdbf},{0xdc7,0xdc9}, + {0xdcb,0xdce},{0xdd5,0xdd5},{0xdd7,0xdd7},{0xde0,0xde5},{0xdf0,0xdf1}, + {0xdf5,0xe00},{0xe3b,0xe3e},{0xe5c,0xe80},{0xe83,0xe83},{0xe85,0xe85}, + {0xe8b,0xe8b},{0xea4,0xea4},{0xea6,0xea6},{0xebe,0xebf},{0xec5,0xec5}, + {0xec7,0xec7},{0xece,0xecf},{0xeda,0xedb},{0xee0,0xeff},{0xf48,0xf48}, + {0xf6d,0xf70},{0xf98,0xf98},{0xfbd,0xfbd},{0xfcd,0xfcd},{0xfdb,0xfff}, + {0x10c6,0x10c6},{0x10c8,0x10cc},{0x10ce,0x10cf},{0x1249,0x1249},{0x124e,0x124f}, + {0x1257,0x1257},{0x1259,0x1259},{0x125e,0x125f},{0x1289,0x1289},{0x128e,0x128f}, + {0x12b1,0x12b1},{0x12b6,0x12b7},{0x12bf,0x12bf},{0x12c1,0x12c1},{0x12c6,0x12c7}, 
+ {0x12d7,0x12d7},{0x1311,0x1311},{0x1316,0x1317},{0x135b,0x135c},{0x137d,0x137f}, + {0x139a,0x139f},{0x13f6,0x13f7},{0x13fe,0x13ff},{0x169d,0x169f},{0x16f9,0x16ff}, + {0x1716,0x171e},{0x1737,0x173f},{0x1754,0x175f},{0x176d,0x176d},{0x1771,0x1771}, + {0x1774,0x177f},{0x17de,0x17df},{0x17ea,0x17ef},{0x17fa,0x17ff},{0x180e,0x180e}, + {0x181a,0x181f},{0x1879,0x187f},{0x18ab,0x18af},{0x18f6,0x18ff},{0x191f,0x191f}, + {0x192c,0x192f},{0x193c,0x193f},{0x1941,0x1943},{0x196e,0x196f},{0x1975,0x197f}, + {0x19ac,0x19af},{0x19ca,0x19cf},{0x19db,0x19dd},{0x1a1c,0x1a1d},{0x1a5f,0x1a5f}, + {0x1a7d,0x1a7e},{0x1a8a,0x1a8f},{0x1a9a,0x1a9f},{0x1aae,0x1aaf},{0x1acf,0x1aff}, + {0x1b4d,0x1b4f},{0x1b7f,0x1b7f},{0x1bf4,0x1bfb},{0x1c38,0x1c3a},{0x1c4a,0x1c4c}, + {0x1c89,0x1c8f},{0x1cbb,0x1cbc},{0x1cc8,0x1ccf},{0x1cfb,0x1cff},{0x1f16,0x1f17}, + {0x1f1e,0x1f1f},{0x1f46,0x1f47},{0x1f4e,0x1f4f},{0x1f58,0x1f58},{0x1f5a,0x1f5a}, + {0x1f5c,0x1f5c},{0x1f5e,0x1f5e},{0x1f7e,0x1f7f},{0x1fb5,0x1fb5},{0x1fc5,0x1fc5}, + {0x1fd4,0x1fd5},{0x1fdc,0x1fdc},{0x1ff0,0x1ff1},{0x1ff5,0x1ff5},{0x1fff,0x1fff}, + {0x200b,0x200f},{0x202a,0x202e},{0x2060,0x206f},{0x2072,0x2073},{0x208f,0x208f}, + {0x209d,0x209f},{0x20c1,0x20cf},{0x20f1,0x20ff},{0x218c,0x218f},{0x2427,0x243f}, + {0x244b,0x245f},{0x2b74,0x2b75},{0x2b96,0x2b96},{0x2cf4,0x2cf8},{0x2d26,0x2d26}, + {0x2d28,0x2d2c},{0x2d2e,0x2d2f},{0x2d68,0x2d6e},{0x2d71,0x2d7e},{0x2d97,0x2d9f}, + {0x2da7,0x2da7},{0x2daf,0x2daf},{0x2db7,0x2db7},{0x2dbf,0x2dbf},{0x2dc7,0x2dc7}, + {0x2dcf,0x2dcf},{0x2dd7,0x2dd7},{0x2ddf,0x2ddf},{0x2e5e,0x2e7f},{0x2e9a,0x2e9a}, + {0x2ef4,0x2eff},{0x2fd6,0x2fef},{0x2ffc,0x2fff},{0x3040,0x3040},{0x3097,0x3098}, + {0x3100,0x3104},{0x3130,0x3130},{0x318f,0x318f},{0x31e4,0x31ef},{0x321f,0x321f}, + {0xa48d,0xa48f},{0xa4c7,0xa4cf},{0xa62c,0xa63f},{0xa6f8,0xa6ff},{0xa7cb,0xa7cf}, + {0xa7d2,0xa7d2},{0xa7d4,0xa7d4},{0xa7da,0xa7f1},{0xa82d,0xa82f},{0xa83a,0xa83f}, + {0xa878,0xa87f},{0xa8c6,0xa8cd},{0xa8da,0xa8df},{0xa954,0xa95e},{0xa97d,0xa97f}, + 
{0xa9ce,0xa9ce},{0xa9da,0xa9dd},{0xa9ff,0xa9ff},{0xaa37,0xaa3f},{0xaa4e,0xaa4f}, + {0xaa5a,0xaa5b},{0xaac3,0xaada},{0xaaf7,0xab00},{0xab07,0xab08},{0xab0f,0xab10}, + {0xab17,0xab1f},{0xab27,0xab27},{0xab2f,0xab2f},{0xab6c,0xab6f},{0xabee,0xabef}, + {0xabfa,0xabff},{0xd7a4,0xd7af},{0xd7c7,0xd7ca},{0xd7fc,0xf8ff},{0xfa6e,0xfa6f}, + {0xfada,0xfaff},{0xfb07,0xfb12},{0xfb18,0xfb1c},{0xfb37,0xfb37},{0xfb3d,0xfb3d}, + {0xfb3f,0xfb3f},{0xfb42,0xfb42},{0xfb45,0xfb45},{0xfbc3,0xfbd2},{0xfd90,0xfd91}, + {0xfdc8,0xfdce},{0xfdd0,0xfdef},{0xfe1a,0xfe1f},{0xfe53,0xfe53},{0xfe67,0xfe67}, + {0xfe6c,0xfe6f},{0xfe75,0xfe75},{0xfefd,0xff00},{0xffbf,0xffc1},{0xffc8,0xffc9}, + {0xffd0,0xffd1},{0xffd8,0xffd9},{0xffdd,0xffdf},{0xffe7,0xffe7},{0xffef,0xfffb}, + {0xfffe,0xffff},{0x1000c,0x1000c},{0x10027,0x10027},{0x1003b,0x1003b},{0x1003e,0x1003e}, + {0x1004e,0x1004f},{0x1005e,0x1007f},{0x100fb,0x100ff},{0x10103,0x10106},{0x10134,0x10136}, + {0x1018f,0x1018f},{0x1019d,0x1019f},{0x101a1,0x101cf},{0x101fe,0x1027f},{0x1029d,0x1029f}, + {0x102d1,0x102df},{0x102fc,0x102ff},{0x10324,0x1032c},{0x1034b,0x1034f},{0x1037b,0x1037f}, + {0x1039e,0x1039e},{0x103c4,0x103c7},{0x103d6,0x103ff},{0x1049e,0x1049f},{0x104aa,0x104af}, + {0x104d4,0x104d7},{0x104fc,0x104ff},{0x10528,0x1052f},{0x10564,0x1056e},{0x1057b,0x1057b}, + {0x1058b,0x1058b},{0x10593,0x10593},{0x10596,0x10596},{0x105a2,0x105a2},{0x105b2,0x105b2}, + {0x105ba,0x105ba},{0x105bd,0x105ff},{0x10737,0x1073f},{0x10756,0x1075f},{0x10768,0x1077f}, + {0x10786,0x10786},{0x107b1,0x107b1},{0x107bb,0x107ff},{0x10806,0x10807},{0x10809,0x10809}, + {0x10836,0x10836},{0x10839,0x1083b},{0x1083d,0x1083e},{0x10856,0x10856},{0x1089f,0x108a6}, + {0x108b0,0x108df},{0x108f3,0x108f3},{0x108f6,0x108fa},{0x1091c,0x1091e},{0x1093a,0x1093e}, + {0x10940,0x1097f},{0x109b8,0x109bb},{0x109d0,0x109d1},{0x10a04,0x10a04},{0x10a07,0x10a0b}, + {0x10a14,0x10a14},{0x10a18,0x10a18},{0x10a36,0x10a37},{0x10a3b,0x10a3e},{0x10a49,0x10a4f}, + 
{0x10a59,0x10a5f},{0x10aa0,0x10abf},{0x10ae7,0x10aea},{0x10af7,0x10aff},{0x10b36,0x10b38}, + {0x10b56,0x10b57},{0x10b73,0x10b77},{0x10b92,0x10b98},{0x10b9d,0x10ba8},{0x10bb0,0x10bff}, + {0x10c49,0x10c7f},{0x10cb3,0x10cbf},{0x10cf3,0x10cf9},{0x10d28,0x10d2f},{0x10d3a,0x10e5f}, + {0x10e7f,0x10e7f},{0x10eaa,0x10eaa},{0x10eae,0x10eaf},{0x10eb2,0x10eff},{0x10f28,0x10f2f}, + {0x10f5a,0x10f6f},{0x10f8a,0x10faf},{0x10fcc,0x10fdf},{0x10ff7,0x10fff},{0x1104e,0x11051}, + {0x11076,0x1107e},{0x110bd,0x110bd},{0x110c3,0x110cf},{0x110e9,0x110ef},{0x110fa,0x110ff}, + {0x11135,0x11135},{0x11148,0x1114f},{0x11177,0x1117f},{0x111e0,0x111e0},{0x111f5,0x111ff}, + {0x11212,0x11212},{0x1123f,0x1127f},{0x11287,0x11287},{0x11289,0x11289},{0x1128e,0x1128e}, + {0x1129e,0x1129e},{0x112aa,0x112af},{0x112eb,0x112ef},{0x112fa,0x112ff},{0x11304,0x11304}, + {0x1130d,0x1130e},{0x11311,0x11312},{0x11329,0x11329},{0x11331,0x11331},{0x11334,0x11334}, + {0x1133a,0x1133a},{0x11345,0x11346},{0x11349,0x1134a},{0x1134e,0x1134f},{0x11351,0x11356}, + {0x11358,0x1135c},{0x11364,0x11365},{0x1136d,0x1136f},{0x11375,0x113ff},{0x1145c,0x1145c}, + {0x11462,0x1147f},{0x114c8,0x114cf},{0x114da,0x1157f},{0x115b6,0x115b7},{0x115de,0x115ff}, + {0x11645,0x1164f},{0x1165a,0x1165f},{0x1166d,0x1167f},{0x116ba,0x116bf},{0x116ca,0x116ff}, + {0x1171b,0x1171c},{0x1172c,0x1172f},{0x11747,0x117ff},{0x1183c,0x1189f},{0x118f3,0x118fe}, + {0x11907,0x11908},{0x1190a,0x1190b},{0x11914,0x11914},{0x11917,0x11917},{0x11936,0x11936}, + {0x11939,0x1193a},{0x11947,0x1194f},{0x1195a,0x1199f},{0x119a8,0x119a9},{0x119d8,0x119d9}, + {0x119e5,0x119ff},{0x11a48,0x11a4f},{0x11aa3,0x11aaf},{0x11af9,0x11bff},{0x11c09,0x11c09}, + {0x11c37,0x11c37},{0x11c46,0x11c4f},{0x11c6d,0x11c6f},{0x11c90,0x11c91},{0x11ca8,0x11ca8}, + {0x11cb7,0x11cff},{0x11d07,0x11d07},{0x11d0a,0x11d0a},{0x11d37,0x11d39},{0x11d3b,0x11d3b}, + {0x11d3e,0x11d3e},{0x11d48,0x11d4f},{0x11d5a,0x11d5f},{0x11d66,0x11d66},{0x11d69,0x11d69}, + 
{0x11d8f,0x11d8f},{0x11d92,0x11d92},{0x11d99,0x11d9f},{0x11daa,0x11edf},{0x11ef9,0x11faf}, + {0x11fb1,0x11fbf},{0x11ff2,0x11ffe},{0x1239a,0x123ff},{0x1246f,0x1246f},{0x12475,0x1247f}, + {0x12544,0x12f8f},{0x12ff3,0x12fff},{0x1342f,0x143ff},{0x14647,0x167ff},{0x16a39,0x16a3f}, + {0x16a5f,0x16a5f},{0x16a6a,0x16a6d},{0x16abf,0x16abf},{0x16aca,0x16acf},{0x16aee,0x16aef}, + {0x16af6,0x16aff},{0x16b46,0x16b4f},{0x16b5a,0x16b5a},{0x16b62,0x16b62},{0x16b78,0x16b7c}, + {0x16b90,0x16e3f},{0x16e9b,0x16eff},{0x16f4b,0x16f4e},{0x16f88,0x16f8e},{0x16fa0,0x16fdf}, + {0x16fe5,0x16fef},{0x16ff2,0x16fff},{0x187f8,0x187ff},{0x18cd6,0x18cff},{0x18d09,0x1afef}, + {0x1aff4,0x1aff4},{0x1affc,0x1affc},{0x1afff,0x1afff},{0x1b123,0x1b14f},{0x1b153,0x1b163}, + {0x1b168,0x1b16f},{0x1b2fc,0x1bbff},{0x1bc6b,0x1bc6f},{0x1bc7d,0x1bc7f},{0x1bc89,0x1bc8f}, + {0x1bc9a,0x1bc9b},{0x1bca0,0x1ceff},{0x1cf2e,0x1cf2f},{0x1cf47,0x1cf4f},{0x1cfc4,0x1cfff}, + {0x1d0f6,0x1d0ff},{0x1d127,0x1d128},{0x1d173,0x1d17a},{0x1d1eb,0x1d1ff},{0x1d246,0x1d2df}, + {0x1d2f4,0x1d2ff},{0x1d357,0x1d35f},{0x1d379,0x1d3ff},{0x1d455,0x1d455},{0x1d49d,0x1d49d}, + {0x1d4a0,0x1d4a1},{0x1d4a3,0x1d4a4},{0x1d4a7,0x1d4a8},{0x1d4ad,0x1d4ad},{0x1d4ba,0x1d4ba}, + {0x1d4bc,0x1d4bc},{0x1d4c4,0x1d4c4},{0x1d506,0x1d506},{0x1d50b,0x1d50c},{0x1d515,0x1d515}, + {0x1d51d,0x1d51d},{0x1d53a,0x1d53a},{0x1d53f,0x1d53f},{0x1d545,0x1d545},{0x1d547,0x1d549}, + {0x1d551,0x1d551},{0x1d6a6,0x1d6a7},{0x1d7cc,0x1d7cd},{0x1da8c,0x1da9a},{0x1daa0,0x1daa0}, + {0x1dab0,0x1deff},{0x1df1f,0x1dfff},{0x1e007,0x1e007},{0x1e019,0x1e01a},{0x1e022,0x1e022}, + {0x1e025,0x1e025},{0x1e02b,0x1e0ff},{0x1e12d,0x1e12f},{0x1e13e,0x1e13f},{0x1e14a,0x1e14d}, + {0x1e150,0x1e28f},{0x1e2af,0x1e2bf},{0x1e2fa,0x1e2fe},{0x1e300,0x1e7df},{0x1e7e7,0x1e7e7}, + {0x1e7ec,0x1e7ec},{0x1e7ef,0x1e7ef},{0x1e7ff,0x1e7ff},{0x1e8c5,0x1e8c6},{0x1e8d7,0x1e8ff}, + {0x1e94c,0x1e94f},{0x1e95a,0x1e95d},{0x1e960,0x1ec70},{0x1ecb5,0x1ed00},{0x1ed3e,0x1edff}, + 
{0x1ee04,0x1ee04},{0x1ee20,0x1ee20},{0x1ee23,0x1ee23},{0x1ee25,0x1ee26},{0x1ee28,0x1ee28}, + {0x1ee33,0x1ee33},{0x1ee38,0x1ee38},{0x1ee3a,0x1ee3a},{0x1ee3c,0x1ee41},{0x1ee43,0x1ee46}, + {0x1ee48,0x1ee48},{0x1ee4a,0x1ee4a},{0x1ee4c,0x1ee4c},{0x1ee50,0x1ee50},{0x1ee53,0x1ee53}, + {0x1ee55,0x1ee56},{0x1ee58,0x1ee58},{0x1ee5a,0x1ee5a},{0x1ee5c,0x1ee5c},{0x1ee5e,0x1ee5e}, + {0x1ee60,0x1ee60},{0x1ee63,0x1ee63},{0x1ee65,0x1ee66},{0x1ee6b,0x1ee6b},{0x1ee73,0x1ee73}, + {0x1ee78,0x1ee78},{0x1ee7d,0x1ee7d},{0x1ee7f,0x1ee7f},{0x1ee8a,0x1ee8a},{0x1ee9c,0x1eea0}, + {0x1eea4,0x1eea4},{0x1eeaa,0x1eeaa},{0x1eebc,0x1eeef},{0x1eef2,0x1efff},{0x1f02c,0x1f02f}, + {0x1f094,0x1f09f},{0x1f0af,0x1f0b0},{0x1f0c0,0x1f0c0},{0x1f0d0,0x1f0d0},{0x1f0f6,0x1f0ff}, + {0x1f1ae,0x1f1e5},{0x1f203,0x1f20f},{0x1f23c,0x1f23f},{0x1f249,0x1f24f},{0x1f252,0x1f25f}, + {0x1f266,0x1f2ff},{0x1f6d8,0x1f6dc},{0x1f6ed,0x1f6ef},{0x1f6fd,0x1f6ff},{0x1f774,0x1f77f}, + {0x1f7d9,0x1f7df},{0x1f7ec,0x1f7ef},{0x1f7f1,0x1f7ff},{0x1f80c,0x1f80f},{0x1f848,0x1f84f}, + {0x1f85a,0x1f85f},{0x1f888,0x1f88f},{0x1f8ae,0x1f8af},{0x1f8b2,0x1f8ff},{0x1fa54,0x1fa5f}, + {0x1fa6e,0x1fa6f},{0x1fa75,0x1fa77},{0x1fa7d,0x1fa7f},{0x1fa87,0x1fa8f},{0x1faad,0x1faaf}, + {0x1fabb,0x1fabf},{0x1fac6,0x1facf},{0x1fada,0x1fadf},{0x1fae8,0x1faef},{0x1faf7,0x1faff}, + {0x1fb93,0x1fb93},{0x1fbcb,0x1fbef},{0x1fbfa,0x1ffff},{0x2a6e0,0x2a6ff},{0x2b739,0x2b73f}, + {0x2b81e,0x2b81f},{0x2cea2,0x2ceaf},{0x2ebe1,0x2f7ff},{0x2fa1e,0x2ffff},{0x3134b,0xe00ff}, + {0x3134b,0xe00ff} +}; + +// This category is not official and is only used for regex purposes +static const std::vector> whitespace_ranges = { + {0x0009, 0x0009}, {0x000A, 0x000A}, {0x000B, 0x000B}, {0x000C, 0x000C}, + {0x000D, 0x000D}, {0x0020, 0x0020}, {0x85, 0x85}, {0xa0, 0xa0}, {0x1680, 0x1680}, + {0x2000, 0x200a}, {0x2028, 0x2029}, {0x202f, 0x202f}, {0x205f, 0x205f}, {0x3000, 0x3000} +}; + +static std::vector> all_ranges; + +static std::map, int> codepoint_type_map; + +static std::string 
codepoint_to_utf8(uint32_t cp) { + std::string result; + if (/* 0x00 <= cp && */ cp <= 0x7f) { + result.push_back(cp); + } + else if (0x80 <= cp && cp <= 0x7ff) { + result.push_back(0xc0 | ((cp >> 6) & 0x1f)); + result.push_back(0x80 | (cp & 0x3f)); + } + else if (0x800 <= cp && cp <= 0xffff) { + result.push_back(0xe0 | ((cp >> 12) & 0x0f)); + result.push_back(0x80 | ((cp >> 6) & 0x3f)); + result.push_back(0x80 | (cp & 0x3f)); + } + else if (0x10000 <= cp && cp <= 0x10ffff) { + result.push_back(0xf0 | ((cp >> 18) & 0x07)); + result.push_back(0x80 | ((cp >> 12) & 0x3f)); + result.push_back(0x80 | ((cp >> 6) & 0x3f)); + result.push_back(0x80 | (cp & 0x3f)); + } + else { + throw std::invalid_argument("invalid codepoint"); + } + return result; +} + +static std::string codepoints_to_utf8(const std::vector & cps) { + std::string result; + for (size_t i = 0; i < cps.size(); ++i) { + result.append(codepoint_to_utf8(cps[i])); + } + return result; +} + +static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) { + assert(offset < utf8.size()); + if (!(utf8[offset + 0] & 0x80)) { + auto result = utf8[offset + 0]; + offset += 1; + return result; + } + else if (!(utf8[offset + 0] & 0x40)) { + throw std::invalid_argument("invalid character"); + } + else if (!(utf8[offset + 0] & 0x20)) { + if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) { + throw std::invalid_argument("invalid character"); + } + auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f); + offset += 2; + return result; + } + else if (!(utf8[offset + 0] & 0x10)) { + if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! 
((utf8[offset + 2] & 0xc0) == 0x80)) { + throw std::invalid_argument("invalid character"); + } + auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f); + offset += 3; + return result; + } + else if (!(utf8[offset + 0] & 0x08)) { + if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) { + throw std::invalid_argument("invalid character"); + } + auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f); + offset += 4; + return result; + } + throw std::invalid_argument("invalid string"); +} + +static std::vector codepoints_from_utf8(const std::string & utf8) { + std::vector result; + size_t offset = 0; + while (offset < utf8.size()) { + result.push_back(codepoint_from_utf8(utf8, offset)); + } + return result; +} + + + +#define CODEPOINT_TYPE_UNIDENTIFIED 0 +#define CODEPOINT_TYPE_OTHER 1 +#define CODEPOINT_TYPE_NUMBER 2 +#define CODEPOINT_TYPE_LETTER 3 +#define CODEPOINT_TYPE_PUNCTUATION 4 +#define CODEPOINT_TYPE_MARK 5 +#define CODEPOINT_TYPE_SEPARATOR 6 +#define CODEPOINT_TYPE_SYMBOL 7 + + +static bool codepoint_type_init_map() { + for (auto i : other_ranges) { + codepoint_type_map[i] = CODEPOINT_TYPE_OTHER; + } + for (auto i : number_ranges) { + codepoint_type_map[i] = CODEPOINT_TYPE_NUMBER; + } + for (auto i : letter_ranges) { + codepoint_type_map[i] = CODEPOINT_TYPE_LETTER; + } + for (auto i : punctuation_ranges) { + codepoint_type_map[i] = CODEPOINT_TYPE_PUNCTUATION; + } + for (auto i : mark_ranges) { + codepoint_type_map[i] = CODEPOINT_TYPE_MARK; + } + for (auto i : separator_ranges) { + codepoint_type_map[i] = CODEPOINT_TYPE_SEPARATOR; + } + for (auto i : symbol_ranges) { + codepoint_type_map[i] = CODEPOINT_TYPE_SYMBOL; + } + return true; +} + +static bool codepoint_type_init_search_vector() { + 
all_ranges.insert(all_ranges.end(), other_ranges.begin(), other_ranges.end()); + all_ranges.insert(all_ranges.end(), number_ranges.begin(), number_ranges.end()); + all_ranges.insert(all_ranges.end(), letter_ranges.begin(), letter_ranges.end()); + all_ranges.insert(all_ranges.end(), punctuation_ranges.begin(), punctuation_ranges.end()); + all_ranges.insert(all_ranges.end(), mark_ranges.begin(), mark_ranges.end()); + all_ranges.insert(all_ranges.end(), separator_ranges.begin(), separator_ranges.end()); + all_ranges.insert(all_ranges.end(), symbol_ranges.begin(), symbol_ranges.end()); + std::sort(all_ranges.begin(), all_ranges.end()); + return true; +} + +static size_t binary_search_implement(uint32_t cp, const std::vector> & ranges) { + size_t left = 0; + size_t right = ranges.size() - 1; + + while (left <= right) { + size_t mid = left + (right - left) / 2; + const auto& range = ranges[mid]; + + if (cp >= range.first && cp <= range.second) { + // Target is within the range of the current pair. + return mid; + } else if (cp < range.first) { + // Target is less than the start of the range, search in the left half. + right = mid - 1; + } else { + // Target is greater than the end of the range, search in the right half. 
+ left = mid + 1; + } + } + throw std::runtime_error("Target out of range!"); +} + +static int codepoint_type_binary_search(uint32_t cp) { + try { + auto result = binary_search_implement(cp, all_ranges); + return codepoint_type_map[all_ranges[result]]; + } catch (const std::runtime_error & e) { + return CODEPOINT_TYPE_UNIDENTIFIED; + } +} + +static bool codepoint_type_init() { + bool map_initialized = codepoint_type_init_map(); + bool sv_initialized = codepoint_type_init_search_vector(); + + if(map_initialized && sv_initialized) { + return true; + } + + return false; +} + +static int codepoint_type(uint32_t cp) { + static bool codepoint_type_initialized = codepoint_type_init(); + return codepoint_type_binary_search(cp); +} + +static int codepoint_type(const std::string & utf8) { + if (utf8.length() == 0) { + return CODEPOINT_TYPE_UNIDENTIFIED; + } + return codepoint_type(codepoints_from_utf8(utf8).back()); +} + +static bool codepoint_is_whitespace(const std::string & utf8) { + if (utf8.length() == 0) { + return false; + } + + try { + binary_search_implement(codepoints_from_utf8(utf8).back(), whitespace_ranges); + return true; + } catch (const std::runtime_error & e) { + return false; + } +} + +static std::unordered_map bytes_to_unicode_map_bpe() { + std::unordered_map map; + for (int ch = u'!'; ch <= u'~'; ++ch) { + assert(0 <= ch && ch < 256); + map[ch] = codepoint_to_utf8(ch); + } + for (int ch = u'¡'; ch <= u'¬'; ++ch) { + assert(0 <= ch && ch < 256); + map[ch] = codepoint_to_utf8(ch); + } + for (int ch = u'®'; ch <= u'ÿ'; ++ch) { + assert(0 <= ch && ch < 256); + map[ch] = codepoint_to_utf8(ch); + } + auto n = 0; + for (int ch = 0; ch < 256; ++ch) { + if (map.find(ch) == map.end()) { + map[ch] = codepoint_to_utf8(256 + n); + ++n; + } + } + return map; +} + +static std::string bytes_to_unicode_bpe(uint8_t byte) { + static std::unordered_map map = bytes_to_unicode_map_bpe(); + return map.at(byte); +} + +static std::unordered_map unicode_to_bytes_map_bpe() { + 
std::unordered_map map; + for (int ch = u'!'; ch <= u'~'; ++ch) { + assert(0 <= ch && ch < 256); + map[codepoint_to_utf8(ch)] = ch; + } + for (int ch = u'¡'; ch <= u'¬'; ++ch) { + assert(0 <= ch && ch < 256); + map[codepoint_to_utf8(ch)] = ch; + } + for (int ch = u'®'; ch <= u'ÿ'; ++ch) { + assert(0 <= ch && ch < 256); + map[codepoint_to_utf8(ch)] = ch; + } + auto n = 0; + for (int ch = 0; ch < 256; ++ch) { + if (map.find(codepoint_to_utf8(ch)) == map.end()) { + map[codepoint_to_utf8(256 + n)] = ch; + ++n; + } + } + return map; +} + +static uint8_t unicode_to_bytes_bpe(const std::string & utf8) { + static std::unordered_map map = unicode_to_bytes_map_bpe(); + return map.at(utf8); +} + + diff --git a/unicode.h b/unicode.h new file mode 100644 index 00000000000..bf7f105dfaf --- /dev/null +++ b/unicode.h @@ -0,0 +1,742 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +static const std::vector> number_ranges = { + {0x30,0x39},{0xb2,0xb3},{0xb9,0xb9},{0xbc,0xbe},{0x660,0x669}, + {0x6f0,0x6f9},{0x7c0,0x7c9},{0x966,0x96f},{0x9e6,0x9ef},{0x9f4,0x9f9}, + {0xa66,0xa6f},{0xae6,0xaef},{0xb66,0xb6f},{0xb72,0xb77},{0xbe6,0xbf2}, + {0xc66,0xc6f},{0xc78,0xc7e},{0xce6,0xcef},{0xd58,0xd5e},{0xd66,0xd78}, + {0xde6,0xdef},{0xe50,0xe59},{0xed0,0xed9},{0xf20,0xf33},{0x1040,0x1049}, + {0x1090,0x1099},{0x1369,0x137c},{0x16ee,0x16f0},{0x17e0,0x17e9},{0x17f0,0x17f9}, + {0x1810,0x1819},{0x1946,0x194f},{0x19d0,0x19da},{0x1a80,0x1a89},{0x1a90,0x1a99}, + {0x1b50,0x1b59},{0x1bb0,0x1bb9},{0x1c40,0x1c49},{0x1c50,0x1c59},{0x2070,0x2070}, + {0x2074,0x2079},{0x2080,0x2089},{0x2150,0x2182},{0x2185,0x2189},{0x2460,0x249b}, + {0x24ea,0x24ff},{0x2776,0x2793},{0x2cfd,0x2cfd},{0x3007,0x3007},{0x3021,0x3029}, + {0x3038,0x303a},{0x3192,0x3195},{0x3220,0x3229},{0x3248,0x324f},{0x3251,0x325f}, + {0x3280,0x3289},{0x32b1,0x32bf},{0xa620,0xa629},{0xa6e6,0xa6ef},{0xa830,0xa835}, + {0xa8d0,0xa8d9},{0xa900,0xa909},{0xa9d0,0xa9d9},{0xa9f0,0xa9f9},{0xaa50,0xaa59}, + 
{0xabf0,0xabf9},{0xff10,0xff19},{0x10107,0x10133},{0x10140,0x10178},{0x1018a,0x1018b}, + {0x102e1,0x102fb},{0x10320,0x10323},{0x10341,0x10341},{0x1034a,0x1034a},{0x103d1,0x103d5}, + {0x104a0,0x104a9},{0x10858,0x1085f},{0x10879,0x1087f},{0x108a7,0x108af},{0x108fb,0x108ff}, + {0x10916,0x1091b},{0x109bc,0x109bd},{0x109c0,0x109cf},{0x109d2,0x109ff},{0x10a40,0x10a48}, + {0x10a7d,0x10a7e},{0x10a9d,0x10a9f},{0x10aeb,0x10aef},{0x10b58,0x10b5f},{0x10b78,0x10b7f}, + {0x10ba9,0x10baf},{0x10cfa,0x10cff},{0x10d30,0x10d39},{0x10e60,0x10e7e},{0x10f1d,0x10f26}, + {0x10f51,0x10f54},{0x10fc5,0x10fcb},{0x11052,0x1106f},{0x110f0,0x110f9},{0x11136,0x1113f}, + {0x111d0,0x111d9},{0x111e1,0x111f4},{0x112f0,0x112f9},{0x11450,0x11459},{0x114d0,0x114d9}, + {0x11650,0x11659},{0x116c0,0x116c9},{0x11730,0x1173b},{0x118e0,0x118f2},{0x11950,0x11959}, + {0x11c50,0x11c6c},{0x11d50,0x11d59},{0x11da0,0x11da9},{0x11fc0,0x11fd4},{0x12400,0x1246e}, + {0x16a60,0x16a69},{0x16ac0,0x16ac9},{0x16b50,0x16b59},{0x16b5b,0x16b61},{0x16e80,0x16e96}, + {0x1d2e0,0x1d2f3},{0x1d360,0x1d378},{0x1d7ce,0x1d7ff},{0x1e140,0x1e149},{0x1e2f0,0x1e2f9}, + {0x1e8c7,0x1e8cf},{0x1e950,0x1e959},{0x1ec71,0x1ecab},{0x1ecad,0x1ecaf},{0x1ecb1,0x1ecb4}, + {0x1ed01,0x1ed2d},{0x1ed2f,0x1ed3d},{0x1f100,0x1f10c},{0x1f100,0x1f10c} +}; + +static const std::vector> letter_ranges = { + {0x41,0x5a},{0x61,0x7a},{0xaa,0xaa},{0xb5,0xb5},{0xba,0xba}, + {0xc0,0xd6},{0xd8,0xf6},{0xf8,0x2c1},{0x2c6,0x2d1},{0x2e0,0x2e4}, + {0x2ec,0x2ec},{0x2ee,0x2ee},{0x370,0x374},{0x376,0x377},{0x37a,0x37d}, + {0x37f,0x37f},{0x386,0x386},{0x388,0x38a},{0x38c,0x38c},{0x38e,0x3a1}, + {0x3a3,0x3f5},{0x3f7,0x481},{0x48a,0x52f},{0x531,0x556},{0x559,0x559}, + {0x560,0x588},{0x5d0,0x5ea},{0x5ef,0x5f2},{0x620,0x64a},{0x66e,0x66f}, + {0x671,0x6d3},{0x6d5,0x6d5},{0x6e5,0x6e6},{0x6ee,0x6ef},{0x6fa,0x6fc}, + {0x6ff,0x6ff},{0x710,0x710},{0x712,0x72f},{0x74d,0x7a5},{0x7b1,0x7b1}, + {0x7ca,0x7ea},{0x7f4,0x7f5},{0x7fa,0x7fa},{0x800,0x815},{0x81a,0x81a}, + 
{0x824,0x824},{0x828,0x828},{0x840,0x858},{0x860,0x86a},{0x870,0x887}, + {0x889,0x88e},{0x8a0,0x8c9},{0x904,0x939},{0x93d,0x93d},{0x950,0x950}, + {0x958,0x961},{0x971,0x980},{0x985,0x98c},{0x98f,0x990},{0x993,0x9a8}, + {0x9aa,0x9b0},{0x9b2,0x9b2},{0x9b6,0x9b9},{0x9bd,0x9bd},{0x9ce,0x9ce}, + {0x9dc,0x9dd},{0x9df,0x9e1},{0x9f0,0x9f1},{0x9fc,0x9fc},{0xa05,0xa0a}, + {0xa0f,0xa10},{0xa13,0xa28},{0xa2a,0xa30},{0xa32,0xa33},{0xa35,0xa36}, + {0xa38,0xa39},{0xa59,0xa5c},{0xa5e,0xa5e},{0xa72,0xa74},{0xa85,0xa8d}, + {0xa8f,0xa91},{0xa93,0xaa8},{0xaaa,0xab0},{0xab2,0xab3},{0xab5,0xab9}, + {0xabd,0xabd},{0xad0,0xad0},{0xae0,0xae1},{0xaf9,0xaf9},{0xb05,0xb0c}, + {0xb0f,0xb10},{0xb13,0xb28},{0xb2a,0xb30},{0xb32,0xb33},{0xb35,0xb39}, + {0xb3d,0xb3d},{0xb5c,0xb5d},{0xb5f,0xb61},{0xb71,0xb71},{0xb83,0xb83}, + {0xb85,0xb8a},{0xb8e,0xb90},{0xb92,0xb95},{0xb99,0xb9a},{0xb9c,0xb9c}, + {0xb9e,0xb9f},{0xba3,0xba4},{0xba8,0xbaa},{0xbae,0xbb9},{0xbd0,0xbd0}, + {0xc05,0xc0c},{0xc0e,0xc10},{0xc12,0xc28},{0xc2a,0xc39},{0xc3d,0xc3d}, + {0xc58,0xc5a},{0xc5d,0xc5d},{0xc60,0xc61},{0xc80,0xc80},{0xc85,0xc8c}, + {0xc8e,0xc90},{0xc92,0xca8},{0xcaa,0xcb3},{0xcb5,0xcb9},{0xcbd,0xcbd}, + {0xcdd,0xcde},{0xce0,0xce1},{0xcf1,0xcf2},{0xd04,0xd0c},{0xd0e,0xd10}, + {0xd12,0xd3a},{0xd3d,0xd3d},{0xd4e,0xd4e},{0xd54,0xd56},{0xd5f,0xd61}, + {0xd7a,0xd7f},{0xd85,0xd96},{0xd9a,0xdb1},{0xdb3,0xdbb},{0xdbd,0xdbd}, + {0xdc0,0xdc6},{0xe01,0xe30},{0xe32,0xe33},{0xe40,0xe46},{0xe81,0xe82}, + {0xe84,0xe84},{0xe86,0xe8a},{0xe8c,0xea3},{0xea5,0xea5},{0xea7,0xeb0}, + {0xeb2,0xeb3},{0xebd,0xebd},{0xec0,0xec4},{0xec6,0xec6},{0xedc,0xedf}, + {0xf00,0xf00},{0xf40,0xf47},{0xf49,0xf6c},{0xf88,0xf8c},{0x1000,0x102a}, + {0x103f,0x103f},{0x1050,0x1055},{0x105a,0x105d},{0x1061,0x1061},{0x1065,0x1066}, + {0x106e,0x1070},{0x1075,0x1081},{0x108e,0x108e},{0x10a0,0x10c5},{0x10c7,0x10c7}, + {0x10cd,0x10cd},{0x10d0,0x10fa},{0x10fc,0x1248},{0x124a,0x124d},{0x1250,0x1256}, + 
{0x1258,0x1258},{0x125a,0x125d},{0x1260,0x1288},{0x128a,0x128d},{0x1290,0x12b0}, + {0x12b2,0x12b5},{0x12b8,0x12be},{0x12c0,0x12c0},{0x12c2,0x12c5},{0x12c8,0x12d6}, + {0x12d8,0x1310},{0x1312,0x1315},{0x1318,0x135a},{0x1380,0x138f},{0x13a0,0x13f5}, + {0x13f8,0x13fd},{0x1401,0x166c},{0x166f,0x167f},{0x1681,0x169a},{0x16a0,0x16ea}, + {0x16f1,0x16f8},{0x1700,0x1711},{0x171f,0x1731},{0x1740,0x1751},{0x1760,0x176c}, + {0x176e,0x1770},{0x1780,0x17b3},{0x17d7,0x17d7},{0x17dc,0x17dc},{0x1820,0x1878}, + {0x1880,0x1884},{0x1887,0x18a8},{0x18aa,0x18aa},{0x18b0,0x18f5},{0x1900,0x191e}, + {0x1950,0x196d},{0x1970,0x1974},{0x1980,0x19ab},{0x19b0,0x19c9},{0x1a00,0x1a16}, + {0x1a20,0x1a54},{0x1aa7,0x1aa7},{0x1b05,0x1b33},{0x1b45,0x1b4c},{0x1b83,0x1ba0}, + {0x1bae,0x1baf},{0x1bba,0x1be5},{0x1c00,0x1c23},{0x1c4d,0x1c4f},{0x1c5a,0x1c7d}, + {0x1c80,0x1c88},{0x1c90,0x1cba},{0x1cbd,0x1cbf},{0x1ce9,0x1cec},{0x1cee,0x1cf3}, + {0x1cf5,0x1cf6},{0x1cfa,0x1cfa},{0x1d00,0x1dbf},{0x1e00,0x1f15},{0x1f18,0x1f1d}, + {0x1f20,0x1f45},{0x1f48,0x1f4d},{0x1f50,0x1f57},{0x1f59,0x1f59},{0x1f5b,0x1f5b}, + {0x1f5d,0x1f5d},{0x1f5f,0x1f7d},{0x1f80,0x1fb4},{0x1fb6,0x1fbc},{0x1fbe,0x1fbe}, + {0x1fc2,0x1fc4},{0x1fc6,0x1fcc},{0x1fd0,0x1fd3},{0x1fd6,0x1fdb},{0x1fe0,0x1fec}, + {0x1ff2,0x1ff4},{0x1ff6,0x1ffc},{0x2071,0x2071},{0x207f,0x207f},{0x2090,0x209c}, + {0x2102,0x2102},{0x2107,0x2107},{0x210a,0x2113},{0x2115,0x2115},{0x2119,0x211d}, + {0x2124,0x2124},{0x2126,0x2126},{0x2128,0x2128},{0x212a,0x212d},{0x212f,0x2139}, + {0x213c,0x213f},{0x2145,0x2149},{0x214e,0x214e},{0x2183,0x2184},{0x2c00,0x2ce4}, + {0x2ceb,0x2cee},{0x2cf2,0x2cf3},{0x2d00,0x2d25},{0x2d27,0x2d27},{0x2d2d,0x2d2d}, + {0x2d30,0x2d67},{0x2d6f,0x2d6f},{0x2d80,0x2d96},{0x2da0,0x2da6},{0x2da8,0x2dae}, + {0x2db0,0x2db6},{0x2db8,0x2dbe},{0x2dc0,0x2dc6},{0x2dc8,0x2dce},{0x2dd0,0x2dd6}, + {0x2dd8,0x2dde},{0x2e2f,0x2e2f},{0x3005,0x3006},{0x3031,0x3035},{0x303b,0x303c}, + {0x3041,0x3096},{0x309d,0x309f},{0x30a1,0x30fa},{0x30fc,0x30ff},{0x3105,0x312f}, + 
{0x3131,0x318e},{0x31a0,0x31bf},{0x31f0,0x31ff},{0x3400,0x4dbf},{0x4e00,0xa48c}, + {0xa4d0,0xa4fd},{0xa500,0xa60c},{0xa610,0xa61f},{0xa62a,0xa62b},{0xa640,0xa66e}, + {0xa67f,0xa69d},{0xa6a0,0xa6e5},{0xa717,0xa71f},{0xa722,0xa788},{0xa78b,0xa7ca}, + {0xa7d0,0xa7d1},{0xa7d3,0xa7d3},{0xa7d5,0xa7d9},{0xa7f2,0xa801},{0xa803,0xa805}, + {0xa807,0xa80a},{0xa80c,0xa822},{0xa840,0xa873},{0xa882,0xa8b3},{0xa8f2,0xa8f7}, + {0xa8fb,0xa8fb},{0xa8fd,0xa8fe},{0xa90a,0xa925},{0xa930,0xa946},{0xa960,0xa97c}, + {0xa984,0xa9b2},{0xa9cf,0xa9cf},{0xa9e0,0xa9e4},{0xa9e6,0xa9ef},{0xa9fa,0xa9fe}, + {0xaa00,0xaa28},{0xaa40,0xaa42},{0xaa44,0xaa4b},{0xaa60,0xaa76},{0xaa7a,0xaa7a}, + {0xaa7e,0xaaaf},{0xaab1,0xaab1},{0xaab5,0xaab6},{0xaab9,0xaabd},{0xaac0,0xaac0}, + {0xaac2,0xaac2},{0xaadb,0xaadd},{0xaae0,0xaaea},{0xaaf2,0xaaf4},{0xab01,0xab06}, + {0xab09,0xab0e},{0xab11,0xab16},{0xab20,0xab26},{0xab28,0xab2e},{0xab30,0xab5a}, + {0xab5c,0xab69},{0xab70,0xabe2},{0xac00,0xd7a3},{0xd7b0,0xd7c6},{0xd7cb,0xd7fb}, + {0xf900,0xfa6d},{0xfa70,0xfad9},{0xfb00,0xfb06},{0xfb13,0xfb17},{0xfb1d,0xfb1d}, + {0xfb1f,0xfb28},{0xfb2a,0xfb36},{0xfb38,0xfb3c},{0xfb3e,0xfb3e},{0xfb40,0xfb41}, + {0xfb43,0xfb44},{0xfb46,0xfbb1},{0xfbd3,0xfd3d},{0xfd50,0xfd8f},{0xfd92,0xfdc7}, + {0xfdf0,0xfdfb},{0xfe70,0xfe74},{0xfe76,0xfefc},{0xff21,0xff3a},{0xff41,0xff5a}, + {0xff66,0xffbe},{0xffc2,0xffc7},{0xffca,0xffcf},{0xffd2,0xffd7},{0xffda,0xffdc}, + {0x10000,0x1000b},{0x1000d,0x10026},{0x10028,0x1003a},{0x1003c,0x1003d},{0x1003f,0x1004d}, + {0x10050,0x1005d},{0x10080,0x100fa},{0x10280,0x1029c},{0x102a0,0x102d0},{0x10300,0x1031f}, + {0x1032d,0x10340},{0x10342,0x10349},{0x10350,0x10375},{0x10380,0x1039d},{0x103a0,0x103c3}, + {0x103c8,0x103cf},{0x10400,0x1049d},{0x104b0,0x104d3},{0x104d8,0x104fb},{0x10500,0x10527}, + {0x10530,0x10563},{0x10570,0x1057a},{0x1057c,0x1058a},{0x1058c,0x10592},{0x10594,0x10595}, + {0x10597,0x105a1},{0x105a3,0x105b1},{0x105b3,0x105b9},{0x105bb,0x105bc},{0x10600,0x10736}, + 
{0x10740,0x10755},{0x10760,0x10767},{0x10780,0x10785},{0x10787,0x107b0},{0x107b2,0x107ba}, + {0x10800,0x10805},{0x10808,0x10808},{0x1080a,0x10835},{0x10837,0x10838},{0x1083c,0x1083c}, + {0x1083f,0x10855},{0x10860,0x10876},{0x10880,0x1089e},{0x108e0,0x108f2},{0x108f4,0x108f5}, + {0x10900,0x10915},{0x10920,0x10939},{0x10980,0x109b7},{0x109be,0x109bf},{0x10a00,0x10a00}, + {0x10a10,0x10a13},{0x10a15,0x10a17},{0x10a19,0x10a35},{0x10a60,0x10a7c},{0x10a80,0x10a9c}, + {0x10ac0,0x10ac7},{0x10ac9,0x10ae4},{0x10b00,0x10b35},{0x10b40,0x10b55},{0x10b60,0x10b72}, + {0x10b80,0x10b91},{0x10c00,0x10c48},{0x10c80,0x10cb2},{0x10cc0,0x10cf2},{0x10d00,0x10d23}, + {0x10e80,0x10ea9},{0x10eb0,0x10eb1},{0x10f00,0x10f1c},{0x10f27,0x10f27},{0x10f30,0x10f45}, + {0x10f70,0x10f81},{0x10fb0,0x10fc4},{0x10fe0,0x10ff6},{0x11003,0x11037},{0x11071,0x11072}, + {0x11075,0x11075},{0x11083,0x110af},{0x110d0,0x110e8},{0x11103,0x11126},{0x11144,0x11144}, + {0x11147,0x11147},{0x11150,0x11172},{0x11176,0x11176},{0x11183,0x111b2},{0x111c1,0x111c4}, + {0x111da,0x111da},{0x111dc,0x111dc},{0x11200,0x11211},{0x11213,0x1122b},{0x11280,0x11286}, + {0x11288,0x11288},{0x1128a,0x1128d},{0x1128f,0x1129d},{0x1129f,0x112a8},{0x112b0,0x112de}, + {0x11305,0x1130c},{0x1130f,0x11310},{0x11313,0x11328},{0x1132a,0x11330},{0x11332,0x11333}, + {0x11335,0x11339},{0x1133d,0x1133d},{0x11350,0x11350},{0x1135d,0x11361},{0x11400,0x11434}, + {0x11447,0x1144a},{0x1145f,0x11461},{0x11480,0x114af},{0x114c4,0x114c5},{0x114c7,0x114c7}, + {0x11580,0x115ae},{0x115d8,0x115db},{0x11600,0x1162f},{0x11644,0x11644},{0x11680,0x116aa}, + {0x116b8,0x116b8},{0x11700,0x1171a},{0x11740,0x11746},{0x11800,0x1182b},{0x118a0,0x118df}, + {0x118ff,0x11906},{0x11909,0x11909},{0x1190c,0x11913},{0x11915,0x11916},{0x11918,0x1192f}, + {0x1193f,0x1193f},{0x11941,0x11941},{0x119a0,0x119a7},{0x119aa,0x119d0},{0x119e1,0x119e1}, + {0x119e3,0x119e3},{0x11a00,0x11a00},{0x11a0b,0x11a32},{0x11a3a,0x11a3a},{0x11a50,0x11a50}, + 
{0x11a5c,0x11a89},{0x11a9d,0x11a9d},{0x11ab0,0x11af8},{0x11c00,0x11c08},{0x11c0a,0x11c2e}, + {0x11c40,0x11c40},{0x11c72,0x11c8f},{0x11d00,0x11d06},{0x11d08,0x11d09},{0x11d0b,0x11d30}, + {0x11d46,0x11d46},{0x11d60,0x11d65},{0x11d67,0x11d68},{0x11d6a,0x11d89},{0x11d98,0x11d98}, + {0x11ee0,0x11ef2},{0x11fb0,0x11fb0},{0x12000,0x12399},{0x12480,0x12543},{0x12f90,0x12ff0}, + {0x13000,0x1342e},{0x14400,0x14646},{0x16800,0x16a38},{0x16a40,0x16a5e},{0x16a70,0x16abe}, + {0x16ad0,0x16aed},{0x16b00,0x16b2f},{0x16b40,0x16b43},{0x16b63,0x16b77},{0x16b7d,0x16b8f}, + {0x16e40,0x16e7f},{0x16f00,0x16f4a},{0x16f50,0x16f50},{0x16f93,0x16f9f},{0x16fe0,0x16fe1}, + {0x16fe3,0x16fe3},{0x17000,0x187f7},{0x18800,0x18cd5},{0x18d00,0x18d08},{0x1aff0,0x1aff3}, + {0x1aff5,0x1affb},{0x1affd,0x1affe},{0x1b000,0x1b122},{0x1b150,0x1b152},{0x1b164,0x1b167}, + {0x1b170,0x1b2fb},{0x1bc00,0x1bc6a},{0x1bc70,0x1bc7c},{0x1bc80,0x1bc88},{0x1bc90,0x1bc99}, + {0x1d400,0x1d454},{0x1d456,0x1d49c},{0x1d49e,0x1d49f},{0x1d4a2,0x1d4a2},{0x1d4a5,0x1d4a6}, + {0x1d4a9,0x1d4ac},{0x1d4ae,0x1d4b9},{0x1d4bb,0x1d4bb},{0x1d4bd,0x1d4c3},{0x1d4c5,0x1d505}, + {0x1d507,0x1d50a},{0x1d50d,0x1d514},{0x1d516,0x1d51c},{0x1d51e,0x1d539},{0x1d53b,0x1d53e}, + {0x1d540,0x1d544},{0x1d546,0x1d546},{0x1d54a,0x1d550},{0x1d552,0x1d6a5},{0x1d6a8,0x1d6c0}, + {0x1d6c2,0x1d6da},{0x1d6dc,0x1d6fa},{0x1d6fc,0x1d714},{0x1d716,0x1d734},{0x1d736,0x1d74e}, + {0x1d750,0x1d76e},{0x1d770,0x1d788},{0x1d78a,0x1d7a8},{0x1d7aa,0x1d7c2},{0x1d7c4,0x1d7cb}, + {0x1df00,0x1df1e},{0x1e100,0x1e12c},{0x1e137,0x1e13d},{0x1e14e,0x1e14e},{0x1e290,0x1e2ad}, + {0x1e2c0,0x1e2eb},{0x1e7e0,0x1e7e6},{0x1e7e8,0x1e7eb},{0x1e7ed,0x1e7ee},{0x1e7f0,0x1e7fe}, + {0x1e800,0x1e8c4},{0x1e900,0x1e943},{0x1e94b,0x1e94b},{0x1ee00,0x1ee03},{0x1ee05,0x1ee1f}, + {0x1ee21,0x1ee22},{0x1ee24,0x1ee24},{0x1ee27,0x1ee27},{0x1ee29,0x1ee32},{0x1ee34,0x1ee37}, + {0x1ee39,0x1ee39},{0x1ee3b,0x1ee3b},{0x1ee42,0x1ee42},{0x1ee47,0x1ee47},{0x1ee49,0x1ee49}, + 
{0x1ee4b,0x1ee4b},{0x1ee4d,0x1ee4f},{0x1ee51,0x1ee52},{0x1ee54,0x1ee54},{0x1ee57,0x1ee57}, + {0x1ee59,0x1ee59},{0x1ee5b,0x1ee5b},{0x1ee5d,0x1ee5d},{0x1ee5f,0x1ee5f},{0x1ee61,0x1ee62}, + {0x1ee64,0x1ee64},{0x1ee67,0x1ee6a},{0x1ee6c,0x1ee72},{0x1ee74,0x1ee77},{0x1ee79,0x1ee7c}, + {0x1ee7e,0x1ee7e},{0x1ee80,0x1ee89},{0x1ee8b,0x1ee9b},{0x1eea1,0x1eea3},{0x1eea5,0x1eea9}, + {0x1eeab,0x1eebb},{0x20000,0x2a6df},{0x2a700,0x2b738},{0x2b740,0x2b81d},{0x2b820,0x2cea1}, + {0x2ceb0,0x2ebe0},{0x2f800,0x2fa1d},{0x2f800,0x2fa1d} +}; + +static const std::vector> punctuation_ranges = { + {0x21,0x23},{0x25,0x2a},{0x2c,0x2f},{0x3a,0x3b},{0x3f,0x40}, + {0x5b,0x5d},{0x5f,0x5f},{0x7b,0x7b},{0x7d,0x7d},{0xa1,0xa1}, + {0xa7,0xa7},{0xab,0xab},{0xb6,0xb7},{0xbb,0xbb},{0xbf,0xbf}, + {0x37e,0x37e},{0x387,0x387},{0x55a,0x55f},{0x589,0x58a},{0x5be,0x5be}, + {0x5c0,0x5c0},{0x5c3,0x5c3},{0x5c6,0x5c6},{0x5f3,0x5f4},{0x609,0x60a}, + {0x60c,0x60d},{0x61b,0x61b},{0x61d,0x61f},{0x66a,0x66d},{0x6d4,0x6d4}, + {0x700,0x70d},{0x7f7,0x7f9},{0x830,0x83e},{0x85e,0x85e},{0x964,0x965}, + {0x970,0x970},{0x9fd,0x9fd},{0xa76,0xa76},{0xaf0,0xaf0},{0xc77,0xc77}, + {0xc84,0xc84},{0xdf4,0xdf4},{0xe4f,0xe4f},{0xe5a,0xe5b},{0xf04,0xf12}, + {0xf14,0xf14},{0xf3a,0xf3d},{0xf85,0xf85},{0xfd0,0xfd4},{0xfd9,0xfda}, + {0x104a,0x104f},{0x10fb,0x10fb},{0x1360,0x1368},{0x1400,0x1400},{0x166e,0x166e}, + {0x169b,0x169c},{0x16eb,0x16ed},{0x1735,0x1736},{0x17d4,0x17d6},{0x17d8,0x17da}, + {0x1800,0x180a},{0x1944,0x1945},{0x1a1e,0x1a1f},{0x1aa0,0x1aa6},{0x1aa8,0x1aad}, + {0x1b5a,0x1b60},{0x1b7d,0x1b7e},{0x1bfc,0x1bff},{0x1c3b,0x1c3f},{0x1c7e,0x1c7f}, + {0x1cc0,0x1cc7},{0x1cd3,0x1cd3},{0x2010,0x2027},{0x2030,0x2043},{0x2045,0x2051}, + {0x2053,0x205e},{0x207d,0x207e},{0x208d,0x208e},{0x2308,0x230b},{0x2329,0x232a}, + {0x2768,0x2775},{0x27c5,0x27c6},{0x27e6,0x27ef},{0x2983,0x2998},{0x29d8,0x29db}, + {0x29fc,0x29fd},{0x2cf9,0x2cfc},{0x2cfe,0x2cff},{0x2d70,0x2d70},{0x2e00,0x2e2e}, + 
{0x2e30,0x2e4f},{0x2e52,0x2e5d},{0x3001,0x3003},{0x3008,0x3011},{0x3014,0x301f}, + {0x3030,0x3030},{0x303d,0x303d},{0x30a0,0x30a0},{0x30fb,0x30fb},{0xa4fe,0xa4ff}, + {0xa60d,0xa60f},{0xa673,0xa673},{0xa67e,0xa67e},{0xa6f2,0xa6f7},{0xa874,0xa877}, + {0xa8ce,0xa8cf},{0xa8f8,0xa8fa},{0xa8fc,0xa8fc},{0xa92e,0xa92f},{0xa95f,0xa95f}, + {0xa9c1,0xa9cd},{0xa9de,0xa9df},{0xaa5c,0xaa5f},{0xaade,0xaadf},{0xaaf0,0xaaf1}, + {0xabeb,0xabeb},{0xfd3e,0xfd3f},{0xfe10,0xfe19},{0xfe30,0xfe52},{0xfe54,0xfe61}, + {0xfe63,0xfe63},{0xfe68,0xfe68},{0xfe6a,0xfe6b},{0xff01,0xff03},{0xff05,0xff0a}, + {0xff0c,0xff0f},{0xff1a,0xff1b},{0xff1f,0xff20},{0xff3b,0xff3d},{0xff3f,0xff3f}, + {0xff5b,0xff5b},{0xff5d,0xff5d},{0xff5f,0xff65},{0x10100,0x10102},{0x1039f,0x1039f}, + {0x103d0,0x103d0},{0x1056f,0x1056f},{0x10857,0x10857},{0x1091f,0x1091f},{0x1093f,0x1093f}, + {0x10a50,0x10a58},{0x10a7f,0x10a7f},{0x10af0,0x10af6},{0x10b39,0x10b3f},{0x10b99,0x10b9c}, + {0x10ead,0x10ead},{0x10f55,0x10f59},{0x10f86,0x10f89},{0x11047,0x1104d},{0x110bb,0x110bc}, + {0x110be,0x110c1},{0x11140,0x11143},{0x11174,0x11175},{0x111c5,0x111c8},{0x111cd,0x111cd}, + {0x111db,0x111db},{0x111dd,0x111df},{0x11238,0x1123d},{0x112a9,0x112a9},{0x1144b,0x1144f}, + {0x1145a,0x1145b},{0x1145d,0x1145d},{0x114c6,0x114c6},{0x115c1,0x115d7},{0x11641,0x11643}, + {0x11660,0x1166c},{0x116b9,0x116b9},{0x1173c,0x1173e},{0x1183b,0x1183b},{0x11944,0x11946}, + {0x119e2,0x119e2},{0x11a3f,0x11a46},{0x11a9a,0x11a9c},{0x11a9e,0x11aa2},{0x11c41,0x11c45}, + {0x11c70,0x11c71},{0x11ef7,0x11ef8},{0x11fff,0x11fff},{0x12470,0x12474},{0x12ff1,0x12ff2}, + {0x16a6e,0x16a6f},{0x16af5,0x16af5},{0x16b37,0x16b3b},{0x16b44,0x16b44},{0x16e97,0x16e9a}, + {0x16fe2,0x16fe2},{0x1bc9f,0x1bc9f},{0x1da87,0x1da8b},{0x1da87,0x1da8b} +}; + +static const std::vector> separator_ranges = { + {0x20,0x20},{0xa0,0xa0},{0x1680,0x1680},{0x2000,0x200a},{0x2028,0x2029}, + {0x202f,0x202f},{0x205f,0x205f},{0x205f,0x205f} +}; + +static const std::vector> mark_ranges = { + 
{0x300,0x36f},{0x483,0x489},{0x591,0x5bd},{0x5bf,0x5bf},{0x5c1,0x5c2}, + {0x5c4,0x5c5},{0x5c7,0x5c7},{0x610,0x61a},{0x64b,0x65f},{0x670,0x670}, + {0x6d6,0x6dc},{0x6df,0x6e4},{0x6e7,0x6e8},{0x6ea,0x6ed},{0x711,0x711}, + {0x730,0x74a},{0x7a6,0x7b0},{0x7eb,0x7f3},{0x7fd,0x7fd},{0x816,0x819}, + {0x81b,0x823},{0x825,0x827},{0x829,0x82d},{0x859,0x85b},{0x898,0x89f}, + {0x8ca,0x8e1},{0x8e3,0x903},{0x93a,0x93c},{0x93e,0x94f},{0x951,0x957}, + {0x962,0x963},{0x981,0x983},{0x9bc,0x9bc},{0x9be,0x9c4},{0x9c7,0x9c8}, + {0x9cb,0x9cd},{0x9d7,0x9d7},{0x9e2,0x9e3},{0x9fe,0x9fe},{0xa01,0xa03}, + {0xa3c,0xa3c},{0xa3e,0xa42},{0xa47,0xa48},{0xa4b,0xa4d},{0xa51,0xa51}, + {0xa70,0xa71},{0xa75,0xa75},{0xa81,0xa83},{0xabc,0xabc},{0xabe,0xac5}, + {0xac7,0xac9},{0xacb,0xacd},{0xae2,0xae3},{0xafa,0xaff},{0xb01,0xb03}, + {0xb3c,0xb3c},{0xb3e,0xb44},{0xb47,0xb48},{0xb4b,0xb4d},{0xb55,0xb57}, + {0xb62,0xb63},{0xb82,0xb82},{0xbbe,0xbc2},{0xbc6,0xbc8},{0xbca,0xbcd}, + {0xbd7,0xbd7},{0xc00,0xc04},{0xc3c,0xc3c},{0xc3e,0xc44},{0xc46,0xc48}, + {0xc4a,0xc4d},{0xc55,0xc56},{0xc62,0xc63},{0xc81,0xc83},{0xcbc,0xcbc}, + {0xcbe,0xcc4},{0xcc6,0xcc8},{0xcca,0xccd},{0xcd5,0xcd6},{0xce2,0xce3}, + {0xd00,0xd03},{0xd3b,0xd3c},{0xd3e,0xd44},{0xd46,0xd48},{0xd4a,0xd4d}, + {0xd57,0xd57},{0xd62,0xd63},{0xd81,0xd83},{0xdca,0xdca},{0xdcf,0xdd4}, + {0xdd6,0xdd6},{0xdd8,0xddf},{0xdf2,0xdf3},{0xe31,0xe31},{0xe34,0xe3a}, + {0xe47,0xe4e},{0xeb1,0xeb1},{0xeb4,0xebc},{0xec8,0xecd},{0xf18,0xf19}, + {0xf35,0xf35},{0xf37,0xf37},{0xf39,0xf39},{0xf3e,0xf3f},{0xf71,0xf84}, + {0xf86,0xf87},{0xf8d,0xf97},{0xf99,0xfbc},{0xfc6,0xfc6},{0x102b,0x103e}, + {0x1056,0x1059},{0x105e,0x1060},{0x1062,0x1064},{0x1067,0x106d},{0x1071,0x1074}, + {0x1082,0x108d},{0x108f,0x108f},{0x109a,0x109d},{0x135d,0x135f},{0x1712,0x1715}, + {0x1732,0x1734},{0x1752,0x1753},{0x1772,0x1773},{0x17b4,0x17d3},{0x17dd,0x17dd}, + {0x180b,0x180d},{0x180f,0x180f},{0x1885,0x1886},{0x18a9,0x18a9},{0x1920,0x192b}, + 
{0x1930,0x193b},{0x1a17,0x1a1b},{0x1a55,0x1a5e},{0x1a60,0x1a7c},{0x1a7f,0x1a7f}, + {0x1ab0,0x1ace},{0x1b00,0x1b04},{0x1b34,0x1b44},{0x1b6b,0x1b73},{0x1b80,0x1b82}, + {0x1ba1,0x1bad},{0x1be6,0x1bf3},{0x1c24,0x1c37},{0x1cd0,0x1cd2},{0x1cd4,0x1ce8}, + {0x1ced,0x1ced},{0x1cf4,0x1cf4},{0x1cf7,0x1cf9},{0x1dc0,0x1dff},{0x20d0,0x20f0}, + {0x2cef,0x2cf1},{0x2d7f,0x2d7f},{0x2de0,0x2dff},{0x302a,0x302f},{0x3099,0x309a}, + {0xa66f,0xa672},{0xa674,0xa67d},{0xa69e,0xa69f},{0xa6f0,0xa6f1},{0xa802,0xa802}, + {0xa806,0xa806},{0xa80b,0xa80b},{0xa823,0xa827},{0xa82c,0xa82c},{0xa880,0xa881}, + {0xa8b4,0xa8c5},{0xa8e0,0xa8f1},{0xa8ff,0xa8ff},{0xa926,0xa92d},{0xa947,0xa953}, + {0xa980,0xa983},{0xa9b3,0xa9c0},{0xa9e5,0xa9e5},{0xaa29,0xaa36},{0xaa43,0xaa43}, + {0xaa4c,0xaa4d},{0xaa7b,0xaa7d},{0xaab0,0xaab0},{0xaab2,0xaab4},{0xaab7,0xaab8}, + {0xaabe,0xaabf},{0xaac1,0xaac1},{0xaaeb,0xaaef},{0xaaf5,0xaaf6},{0xabe3,0xabea}, + {0xabec,0xabed},{0xfb1e,0xfb1e},{0xfe00,0xfe0f},{0xfe20,0xfe2f},{0x101fd,0x101fd}, + {0x102e0,0x102e0},{0x10376,0x1037a},{0x10a01,0x10a03},{0x10a05,0x10a06},{0x10a0c,0x10a0f}, + {0x10a38,0x10a3a},{0x10a3f,0x10a3f},{0x10ae5,0x10ae6},{0x10d24,0x10d27},{0x10eab,0x10eac}, + {0x10f46,0x10f50},{0x10f82,0x10f85},{0x11000,0x11002},{0x11038,0x11046},{0x11070,0x11070}, + {0x11073,0x11074},{0x1107f,0x11082},{0x110b0,0x110ba},{0x110c2,0x110c2},{0x11100,0x11102}, + {0x11127,0x11134},{0x11145,0x11146},{0x11173,0x11173},{0x11180,0x11182},{0x111b3,0x111c0}, + {0x111c9,0x111cc},{0x111ce,0x111cf},{0x1122c,0x11237},{0x1123e,0x1123e},{0x112df,0x112ea}, + {0x11300,0x11303},{0x1133b,0x1133c},{0x1133e,0x11344},{0x11347,0x11348},{0x1134b,0x1134d}, + {0x11357,0x11357},{0x11362,0x11363},{0x11366,0x1136c},{0x11370,0x11374},{0x11435,0x11446}, + {0x1145e,0x1145e},{0x114b0,0x114c3},{0x115af,0x115b5},{0x115b8,0x115c0},{0x115dc,0x115dd}, + {0x11630,0x11640},{0x116ab,0x116b7},{0x1171d,0x1172b},{0x1182c,0x1183a},{0x11930,0x11935}, + 
{0x11937,0x11938},{0x1193b,0x1193e},{0x11940,0x11940},{0x11942,0x11943},{0x119d1,0x119d7}, + {0x119da,0x119e0},{0x119e4,0x119e4},{0x11a01,0x11a0a},{0x11a33,0x11a39},{0x11a3b,0x11a3e}, + {0x11a47,0x11a47},{0x11a51,0x11a5b},{0x11a8a,0x11a99},{0x11c2f,0x11c36},{0x11c38,0x11c3f}, + {0x11c92,0x11ca7},{0x11ca9,0x11cb6},{0x11d31,0x11d36},{0x11d3a,0x11d3a},{0x11d3c,0x11d3d}, + {0x11d3f,0x11d45},{0x11d47,0x11d47},{0x11d8a,0x11d8e},{0x11d90,0x11d91},{0x11d93,0x11d97}, + {0x11ef3,0x11ef6},{0x16af0,0x16af4},{0x16b30,0x16b36},{0x16f4f,0x16f4f},{0x16f51,0x16f87}, + {0x16f8f,0x16f92},{0x16fe4,0x16fe4},{0x16ff0,0x16ff1},{0x1bc9d,0x1bc9e},{0x1cf00,0x1cf2d}, + {0x1cf30,0x1cf46},{0x1d165,0x1d169},{0x1d16d,0x1d172},{0x1d17b,0x1d182},{0x1d185,0x1d18b}, + {0x1d1aa,0x1d1ad},{0x1d242,0x1d244},{0x1da00,0x1da36},{0x1da3b,0x1da6c},{0x1da75,0x1da75}, + {0x1da84,0x1da84},{0x1da9b,0x1da9f},{0x1daa1,0x1daaf},{0x1e000,0x1e006},{0x1e008,0x1e018}, + {0x1e01b,0x1e021},{0x1e023,0x1e024},{0x1e026,0x1e02a},{0x1e130,0x1e136},{0x1e2ae,0x1e2ae}, + {0x1e2ec,0x1e2ef},{0x1e8d0,0x1e8d6},{0x1e944,0x1e94a},{0x1e944,0x1e94a} +}; + +static const std::vector> symbol_ranges = { + {0x24,0x24},{0x2b,0x2b},{0x3c,0x3e},{0x5e,0x5e},{0x60,0x60}, + {0x7c,0x7c},{0x7e,0x7e},{0xa2,0xa6},{0xa8,0xa9},{0xac,0xac}, + {0xae,0xb1},{0xb4,0xb4},{0xb8,0xb8},{0xd7,0xd7},{0xf7,0xf7}, + {0x2c2,0x2c5},{0x2d2,0x2df},{0x2e5,0x2eb},{0x2ed,0x2ed},{0x2ef,0x2ff}, + {0x375,0x375},{0x384,0x385},{0x3f6,0x3f6},{0x482,0x482},{0x58d,0x58f}, + {0x606,0x608},{0x60b,0x60b},{0x60e,0x60f},{0x6de,0x6de},{0x6e9,0x6e9}, + {0x6fd,0x6fe},{0x7f6,0x7f6},{0x7fe,0x7ff},{0x888,0x888},{0x9f2,0x9f3}, + {0x9fa,0x9fb},{0xaf1,0xaf1},{0xb70,0xb70},{0xbf3,0xbfa},{0xc7f,0xc7f}, + {0xd4f,0xd4f},{0xd79,0xd79},{0xe3f,0xe3f},{0xf01,0xf03},{0xf13,0xf13}, + {0xf15,0xf17},{0xf1a,0xf1f},{0xf34,0xf34},{0xf36,0xf36},{0xf38,0xf38}, + {0xfbe,0xfc5},{0xfc7,0xfcc},{0xfce,0xfcf},{0xfd5,0xfd8},{0x109e,0x109f}, + 
{0x1390,0x1399},{0x166d,0x166d},{0x17db,0x17db},{0x1940,0x1940},{0x19de,0x19ff}, + {0x1b61,0x1b6a},{0x1b74,0x1b7c},{0x1fbd,0x1fbd},{0x1fbf,0x1fc1},{0x1fcd,0x1fcf}, + {0x1fdd,0x1fdf},{0x1fed,0x1fef},{0x1ffd,0x1ffe},{0x2044,0x2044},{0x2052,0x2052}, + {0x207a,0x207c},{0x208a,0x208c},{0x20a0,0x20c0},{0x2100,0x2101},{0x2103,0x2106}, + {0x2108,0x2109},{0x2114,0x2114},{0x2116,0x2118},{0x211e,0x2123},{0x2125,0x2125}, + {0x2127,0x2127},{0x2129,0x2129},{0x212e,0x212e},{0x213a,0x213b},{0x2140,0x2144}, + {0x214a,0x214d},{0x214f,0x214f},{0x218a,0x218b},{0x2190,0x2307},{0x230c,0x2328}, + {0x232b,0x2426},{0x2440,0x244a},{0x249c,0x24e9},{0x2500,0x2767},{0x2794,0x27c4}, + {0x27c7,0x27e5},{0x27f0,0x2982},{0x2999,0x29d7},{0x29dc,0x29fb},{0x29fe,0x2b73}, + {0x2b76,0x2b95},{0x2b97,0x2bff},{0x2ce5,0x2cea},{0x2e50,0x2e51},{0x2e80,0x2e99}, + {0x2e9b,0x2ef3},{0x2f00,0x2fd5},{0x2ff0,0x2ffb},{0x3004,0x3004},{0x3012,0x3013}, + {0x3020,0x3020},{0x3036,0x3037},{0x303e,0x303f},{0x309b,0x309c},{0x3190,0x3191}, + {0x3196,0x319f},{0x31c0,0x31e3},{0x3200,0x321e},{0x322a,0x3247},{0x3250,0x3250}, + {0x3260,0x327f},{0x328a,0x32b0},{0x32c0,0x33ff},{0x4dc0,0x4dff},{0xa490,0xa4c6}, + {0xa700,0xa716},{0xa720,0xa721},{0xa789,0xa78a},{0xa828,0xa82b},{0xa836,0xa839}, + {0xaa77,0xaa79},{0xab5b,0xab5b},{0xab6a,0xab6b},{0xfb29,0xfb29},{0xfbb2,0xfbc2}, + {0xfd40,0xfd4f},{0xfdcf,0xfdcf},{0xfdfc,0xfdff},{0xfe62,0xfe62},{0xfe64,0xfe66}, + {0xfe69,0xfe69},{0xff04,0xff04},{0xff0b,0xff0b},{0xff1c,0xff1e},{0xff3e,0xff3e}, + {0xff40,0xff40},{0xff5c,0xff5c},{0xff5e,0xff5e},{0xffe0,0xffe6},{0xffe8,0xffee}, + {0xfffc,0xfffd},{0x10137,0x1013f},{0x10179,0x10189},{0x1018c,0x1018e},{0x10190,0x1019c}, + {0x101a0,0x101a0},{0x101d0,0x101fc},{0x10877,0x10878},{0x10ac8,0x10ac8},{0x1173f,0x1173f}, + {0x11fd5,0x11ff1},{0x16b3c,0x16b3f},{0x16b45,0x16b45},{0x1bc9c,0x1bc9c},{0x1cf50,0x1cfc3}, + {0x1d000,0x1d0f5},{0x1d100,0x1d126},{0x1d129,0x1d164},{0x1d16a,0x1d16c},{0x1d183,0x1d184}, + 
{0x1d18c,0x1d1a9},{0x1d1ae,0x1d1ea},{0x1d200,0x1d241},{0x1d245,0x1d245},{0x1d300,0x1d356}, + {0x1d6c1,0x1d6c1},{0x1d6db,0x1d6db},{0x1d6fb,0x1d6fb},{0x1d715,0x1d715},{0x1d735,0x1d735}, + {0x1d74f,0x1d74f},{0x1d76f,0x1d76f},{0x1d789,0x1d789},{0x1d7a9,0x1d7a9},{0x1d7c3,0x1d7c3}, + {0x1d800,0x1d9ff},{0x1da37,0x1da3a},{0x1da6d,0x1da74},{0x1da76,0x1da83},{0x1da85,0x1da86}, + {0x1e14f,0x1e14f},{0x1e2ff,0x1e2ff},{0x1ecac,0x1ecac},{0x1ecb0,0x1ecb0},{0x1ed2e,0x1ed2e}, + {0x1eef0,0x1eef1},{0x1f000,0x1f02b},{0x1f030,0x1f093},{0x1f0a0,0x1f0ae},{0x1f0b1,0x1f0bf}, + {0x1f0c1,0x1f0cf},{0x1f0d1,0x1f0f5},{0x1f10d,0x1f1ad},{0x1f1e6,0x1f202},{0x1f210,0x1f23b}, + {0x1f240,0x1f248},{0x1f250,0x1f251},{0x1f260,0x1f265},{0x1f300,0x1f6d7},{0x1f6dd,0x1f6ec}, + {0x1f6f0,0x1f6fc},{0x1f700,0x1f773},{0x1f780,0x1f7d8},{0x1f7e0,0x1f7eb},{0x1f7f0,0x1f7f0}, + {0x1f800,0x1f80b},{0x1f810,0x1f847},{0x1f850,0x1f859},{0x1f860,0x1f887},{0x1f890,0x1f8ad}, + {0x1f8b0,0x1f8b1},{0x1f900,0x1fa53},{0x1fa60,0x1fa6d},{0x1fa70,0x1fa74},{0x1fa78,0x1fa7c}, + {0x1fa80,0x1fa86},{0x1fa90,0x1faac},{0x1fab0,0x1faba},{0x1fac0,0x1fac5},{0x1fad0,0x1fad9}, + {0x1fae0,0x1fae7},{0x1faf0,0x1faf6},{0x1fb00,0x1fb92},{0x1fb00,0x1fb92} +}; + +static const std::vector> other_ranges = { + {0x0,0x1f},{0x7f,0x9f},{0xad,0xad},{0x378,0x379},{0x380,0x383}, + {0x38b,0x38b},{0x38d,0x38d},{0x3a2,0x3a2},{0x530,0x530},{0x557,0x558}, + {0x58b,0x58c},{0x590,0x590},{0x5c8,0x5cf},{0x5eb,0x5ee},{0x5f5,0x605}, + {0x61c,0x61c},{0x6dd,0x6dd},{0x70e,0x70f},{0x74b,0x74c},{0x7b2,0x7bf}, + {0x7fb,0x7fc},{0x82e,0x82f},{0x83f,0x83f},{0x85c,0x85d},{0x85f,0x85f}, + {0x86b,0x86f},{0x88f,0x897},{0x8e2,0x8e2},{0x984,0x984},{0x98d,0x98e}, + {0x991,0x992},{0x9a9,0x9a9},{0x9b1,0x9b1},{0x9b3,0x9b5},{0x9ba,0x9bb}, + {0x9c5,0x9c6},{0x9c9,0x9ca},{0x9cf,0x9d6},{0x9d8,0x9db},{0x9de,0x9de}, + {0x9e4,0x9e5},{0x9ff,0xa00},{0xa04,0xa04},{0xa0b,0xa0e},{0xa11,0xa12}, + {0xa29,0xa29},{0xa31,0xa31},{0xa34,0xa34},{0xa37,0xa37},{0xa3a,0xa3b}, + 
{0xa3d,0xa3d},{0xa43,0xa46},{0xa49,0xa4a},{0xa4e,0xa50},{0xa52,0xa58}, + {0xa5d,0xa5d},{0xa5f,0xa65},{0xa77,0xa80},{0xa84,0xa84},{0xa8e,0xa8e}, + {0xa92,0xa92},{0xaa9,0xaa9},{0xab1,0xab1},{0xab4,0xab4},{0xaba,0xabb}, + {0xac6,0xac6},{0xaca,0xaca},{0xace,0xacf},{0xad1,0xadf},{0xae4,0xae5}, + {0xaf2,0xaf8},{0xb00,0xb00},{0xb04,0xb04},{0xb0d,0xb0e},{0xb11,0xb12}, + {0xb29,0xb29},{0xb31,0xb31},{0xb34,0xb34},{0xb3a,0xb3b},{0xb45,0xb46}, + {0xb49,0xb4a},{0xb4e,0xb54},{0xb58,0xb5b},{0xb5e,0xb5e},{0xb64,0xb65}, + {0xb78,0xb81},{0xb84,0xb84},{0xb8b,0xb8d},{0xb91,0xb91},{0xb96,0xb98}, + {0xb9b,0xb9b},{0xb9d,0xb9d},{0xba0,0xba2},{0xba5,0xba7},{0xbab,0xbad}, + {0xbba,0xbbd},{0xbc3,0xbc5},{0xbc9,0xbc9},{0xbce,0xbcf},{0xbd1,0xbd6}, + {0xbd8,0xbe5},{0xbfb,0xbff},{0xc0d,0xc0d},{0xc11,0xc11},{0xc29,0xc29}, + {0xc3a,0xc3b},{0xc45,0xc45},{0xc49,0xc49},{0xc4e,0xc54},{0xc57,0xc57}, + {0xc5b,0xc5c},{0xc5e,0xc5f},{0xc64,0xc65},{0xc70,0xc76},{0xc8d,0xc8d}, + {0xc91,0xc91},{0xca9,0xca9},{0xcb4,0xcb4},{0xcba,0xcbb},{0xcc5,0xcc5}, + {0xcc9,0xcc9},{0xcce,0xcd4},{0xcd7,0xcdc},{0xcdf,0xcdf},{0xce4,0xce5}, + {0xcf0,0xcf0},{0xcf3,0xcff},{0xd0d,0xd0d},{0xd11,0xd11},{0xd45,0xd45}, + {0xd49,0xd49},{0xd50,0xd53},{0xd64,0xd65},{0xd80,0xd80},{0xd84,0xd84}, + {0xd97,0xd99},{0xdb2,0xdb2},{0xdbc,0xdbc},{0xdbe,0xdbf},{0xdc7,0xdc9}, + {0xdcb,0xdce},{0xdd5,0xdd5},{0xdd7,0xdd7},{0xde0,0xde5},{0xdf0,0xdf1}, + {0xdf5,0xe00},{0xe3b,0xe3e},{0xe5c,0xe80},{0xe83,0xe83},{0xe85,0xe85}, + {0xe8b,0xe8b},{0xea4,0xea4},{0xea6,0xea6},{0xebe,0xebf},{0xec5,0xec5}, + {0xec7,0xec7},{0xece,0xecf},{0xeda,0xedb},{0xee0,0xeff},{0xf48,0xf48}, + {0xf6d,0xf70},{0xf98,0xf98},{0xfbd,0xfbd},{0xfcd,0xfcd},{0xfdb,0xfff}, + {0x10c6,0x10c6},{0x10c8,0x10cc},{0x10ce,0x10cf},{0x1249,0x1249},{0x124e,0x124f}, + {0x1257,0x1257},{0x1259,0x1259},{0x125e,0x125f},{0x1289,0x1289},{0x128e,0x128f}, + {0x12b1,0x12b1},{0x12b6,0x12b7},{0x12bf,0x12bf},{0x12c1,0x12c1},{0x12c6,0x12c7}, + 
{0x12d7,0x12d7},{0x1311,0x1311},{0x1316,0x1317},{0x135b,0x135c},{0x137d,0x137f}, + {0x139a,0x139f},{0x13f6,0x13f7},{0x13fe,0x13ff},{0x169d,0x169f},{0x16f9,0x16ff}, + {0x1716,0x171e},{0x1737,0x173f},{0x1754,0x175f},{0x176d,0x176d},{0x1771,0x1771}, + {0x1774,0x177f},{0x17de,0x17df},{0x17ea,0x17ef},{0x17fa,0x17ff},{0x180e,0x180e}, + {0x181a,0x181f},{0x1879,0x187f},{0x18ab,0x18af},{0x18f6,0x18ff},{0x191f,0x191f}, + {0x192c,0x192f},{0x193c,0x193f},{0x1941,0x1943},{0x196e,0x196f},{0x1975,0x197f}, + {0x19ac,0x19af},{0x19ca,0x19cf},{0x19db,0x19dd},{0x1a1c,0x1a1d},{0x1a5f,0x1a5f}, + {0x1a7d,0x1a7e},{0x1a8a,0x1a8f},{0x1a9a,0x1a9f},{0x1aae,0x1aaf},{0x1acf,0x1aff}, + {0x1b4d,0x1b4f},{0x1b7f,0x1b7f},{0x1bf4,0x1bfb},{0x1c38,0x1c3a},{0x1c4a,0x1c4c}, + {0x1c89,0x1c8f},{0x1cbb,0x1cbc},{0x1cc8,0x1ccf},{0x1cfb,0x1cff},{0x1f16,0x1f17}, + {0x1f1e,0x1f1f},{0x1f46,0x1f47},{0x1f4e,0x1f4f},{0x1f58,0x1f58},{0x1f5a,0x1f5a}, + {0x1f5c,0x1f5c},{0x1f5e,0x1f5e},{0x1f7e,0x1f7f},{0x1fb5,0x1fb5},{0x1fc5,0x1fc5}, + {0x1fd4,0x1fd5},{0x1fdc,0x1fdc},{0x1ff0,0x1ff1},{0x1ff5,0x1ff5},{0x1fff,0x1fff}, + {0x200b,0x200f},{0x202a,0x202e},{0x2060,0x206f},{0x2072,0x2073},{0x208f,0x208f}, + {0x209d,0x209f},{0x20c1,0x20cf},{0x20f1,0x20ff},{0x218c,0x218f},{0x2427,0x243f}, + {0x244b,0x245f},{0x2b74,0x2b75},{0x2b96,0x2b96},{0x2cf4,0x2cf8},{0x2d26,0x2d26}, + {0x2d28,0x2d2c},{0x2d2e,0x2d2f},{0x2d68,0x2d6e},{0x2d71,0x2d7e},{0x2d97,0x2d9f}, + {0x2da7,0x2da7},{0x2daf,0x2daf},{0x2db7,0x2db7},{0x2dbf,0x2dbf},{0x2dc7,0x2dc7}, + {0x2dcf,0x2dcf},{0x2dd7,0x2dd7},{0x2ddf,0x2ddf},{0x2e5e,0x2e7f},{0x2e9a,0x2e9a}, + {0x2ef4,0x2eff},{0x2fd6,0x2fef},{0x2ffc,0x2fff},{0x3040,0x3040},{0x3097,0x3098}, + {0x3100,0x3104},{0x3130,0x3130},{0x318f,0x318f},{0x31e4,0x31ef},{0x321f,0x321f}, + {0xa48d,0xa48f},{0xa4c7,0xa4cf},{0xa62c,0xa63f},{0xa6f8,0xa6ff},{0xa7cb,0xa7cf}, + {0xa7d2,0xa7d2},{0xa7d4,0xa7d4},{0xa7da,0xa7f1},{0xa82d,0xa82f},{0xa83a,0xa83f}, + {0xa878,0xa87f},{0xa8c6,0xa8cd},{0xa8da,0xa8df},{0xa954,0xa95e},{0xa97d,0xa97f}, + 
{0xa9ce,0xa9ce},{0xa9da,0xa9dd},{0xa9ff,0xa9ff},{0xaa37,0xaa3f},{0xaa4e,0xaa4f}, + {0xaa5a,0xaa5b},{0xaac3,0xaada},{0xaaf7,0xab00},{0xab07,0xab08},{0xab0f,0xab10}, + {0xab17,0xab1f},{0xab27,0xab27},{0xab2f,0xab2f},{0xab6c,0xab6f},{0xabee,0xabef}, + {0xabfa,0xabff},{0xd7a4,0xd7af},{0xd7c7,0xd7ca},{0xd7fc,0xf8ff},{0xfa6e,0xfa6f}, + {0xfada,0xfaff},{0xfb07,0xfb12},{0xfb18,0xfb1c},{0xfb37,0xfb37},{0xfb3d,0xfb3d}, + {0xfb3f,0xfb3f},{0xfb42,0xfb42},{0xfb45,0xfb45},{0xfbc3,0xfbd2},{0xfd90,0xfd91}, + {0xfdc8,0xfdce},{0xfdd0,0xfdef},{0xfe1a,0xfe1f},{0xfe53,0xfe53},{0xfe67,0xfe67}, + {0xfe6c,0xfe6f},{0xfe75,0xfe75},{0xfefd,0xff00},{0xffbf,0xffc1},{0xffc8,0xffc9}, + {0xffd0,0xffd1},{0xffd8,0xffd9},{0xffdd,0xffdf},{0xffe7,0xffe7},{0xffef,0xfffb}, + {0xfffe,0xffff},{0x1000c,0x1000c},{0x10027,0x10027},{0x1003b,0x1003b},{0x1003e,0x1003e}, + {0x1004e,0x1004f},{0x1005e,0x1007f},{0x100fb,0x100ff},{0x10103,0x10106},{0x10134,0x10136}, + {0x1018f,0x1018f},{0x1019d,0x1019f},{0x101a1,0x101cf},{0x101fe,0x1027f},{0x1029d,0x1029f}, + {0x102d1,0x102df},{0x102fc,0x102ff},{0x10324,0x1032c},{0x1034b,0x1034f},{0x1037b,0x1037f}, + {0x1039e,0x1039e},{0x103c4,0x103c7},{0x103d6,0x103ff},{0x1049e,0x1049f},{0x104aa,0x104af}, + {0x104d4,0x104d7},{0x104fc,0x104ff},{0x10528,0x1052f},{0x10564,0x1056e},{0x1057b,0x1057b}, + {0x1058b,0x1058b},{0x10593,0x10593},{0x10596,0x10596},{0x105a2,0x105a2},{0x105b2,0x105b2}, + {0x105ba,0x105ba},{0x105bd,0x105ff},{0x10737,0x1073f},{0x10756,0x1075f},{0x10768,0x1077f}, + {0x10786,0x10786},{0x107b1,0x107b1},{0x107bb,0x107ff},{0x10806,0x10807},{0x10809,0x10809}, + {0x10836,0x10836},{0x10839,0x1083b},{0x1083d,0x1083e},{0x10856,0x10856},{0x1089f,0x108a6}, + {0x108b0,0x108df},{0x108f3,0x108f3},{0x108f6,0x108fa},{0x1091c,0x1091e},{0x1093a,0x1093e}, + {0x10940,0x1097f},{0x109b8,0x109bb},{0x109d0,0x109d1},{0x10a04,0x10a04},{0x10a07,0x10a0b}, + {0x10a14,0x10a14},{0x10a18,0x10a18},{0x10a36,0x10a37},{0x10a3b,0x10a3e},{0x10a49,0x10a4f}, + 
{0x10a59,0x10a5f},{0x10aa0,0x10abf},{0x10ae7,0x10aea},{0x10af7,0x10aff},{0x10b36,0x10b38}, + {0x10b56,0x10b57},{0x10b73,0x10b77},{0x10b92,0x10b98},{0x10b9d,0x10ba8},{0x10bb0,0x10bff}, + {0x10c49,0x10c7f},{0x10cb3,0x10cbf},{0x10cf3,0x10cf9},{0x10d28,0x10d2f},{0x10d3a,0x10e5f}, + {0x10e7f,0x10e7f},{0x10eaa,0x10eaa},{0x10eae,0x10eaf},{0x10eb2,0x10eff},{0x10f28,0x10f2f}, + {0x10f5a,0x10f6f},{0x10f8a,0x10faf},{0x10fcc,0x10fdf},{0x10ff7,0x10fff},{0x1104e,0x11051}, + {0x11076,0x1107e},{0x110bd,0x110bd},{0x110c3,0x110cf},{0x110e9,0x110ef},{0x110fa,0x110ff}, + {0x11135,0x11135},{0x11148,0x1114f},{0x11177,0x1117f},{0x111e0,0x111e0},{0x111f5,0x111ff}, + {0x11212,0x11212},{0x1123f,0x1127f},{0x11287,0x11287},{0x11289,0x11289},{0x1128e,0x1128e}, + {0x1129e,0x1129e},{0x112aa,0x112af},{0x112eb,0x112ef},{0x112fa,0x112ff},{0x11304,0x11304}, + {0x1130d,0x1130e},{0x11311,0x11312},{0x11329,0x11329},{0x11331,0x11331},{0x11334,0x11334}, + {0x1133a,0x1133a},{0x11345,0x11346},{0x11349,0x1134a},{0x1134e,0x1134f},{0x11351,0x11356}, + {0x11358,0x1135c},{0x11364,0x11365},{0x1136d,0x1136f},{0x11375,0x113ff},{0x1145c,0x1145c}, + {0x11462,0x1147f},{0x114c8,0x114cf},{0x114da,0x1157f},{0x115b6,0x115b7},{0x115de,0x115ff}, + {0x11645,0x1164f},{0x1165a,0x1165f},{0x1166d,0x1167f},{0x116ba,0x116bf},{0x116ca,0x116ff}, + {0x1171b,0x1171c},{0x1172c,0x1172f},{0x11747,0x117ff},{0x1183c,0x1189f},{0x118f3,0x118fe}, + {0x11907,0x11908},{0x1190a,0x1190b},{0x11914,0x11914},{0x11917,0x11917},{0x11936,0x11936}, + {0x11939,0x1193a},{0x11947,0x1194f},{0x1195a,0x1199f},{0x119a8,0x119a9},{0x119d8,0x119d9}, + {0x119e5,0x119ff},{0x11a48,0x11a4f},{0x11aa3,0x11aaf},{0x11af9,0x11bff},{0x11c09,0x11c09}, + {0x11c37,0x11c37},{0x11c46,0x11c4f},{0x11c6d,0x11c6f},{0x11c90,0x11c91},{0x11ca8,0x11ca8}, + {0x11cb7,0x11cff},{0x11d07,0x11d07},{0x11d0a,0x11d0a},{0x11d37,0x11d39},{0x11d3b,0x11d3b}, + {0x11d3e,0x11d3e},{0x11d48,0x11d4f},{0x11d5a,0x11d5f},{0x11d66,0x11d66},{0x11d69,0x11d69}, + 
{0x11d8f,0x11d8f},{0x11d92,0x11d92},{0x11d99,0x11d9f},{0x11daa,0x11edf},{0x11ef9,0x11faf}, + {0x11fb1,0x11fbf},{0x11ff2,0x11ffe},{0x1239a,0x123ff},{0x1246f,0x1246f},{0x12475,0x1247f}, + {0x12544,0x12f8f},{0x12ff3,0x12fff},{0x1342f,0x143ff},{0x14647,0x167ff},{0x16a39,0x16a3f}, + {0x16a5f,0x16a5f},{0x16a6a,0x16a6d},{0x16abf,0x16abf},{0x16aca,0x16acf},{0x16aee,0x16aef}, + {0x16af6,0x16aff},{0x16b46,0x16b4f},{0x16b5a,0x16b5a},{0x16b62,0x16b62},{0x16b78,0x16b7c}, + {0x16b90,0x16e3f},{0x16e9b,0x16eff},{0x16f4b,0x16f4e},{0x16f88,0x16f8e},{0x16fa0,0x16fdf}, + {0x16fe5,0x16fef},{0x16ff2,0x16fff},{0x187f8,0x187ff},{0x18cd6,0x18cff},{0x18d09,0x1afef}, + {0x1aff4,0x1aff4},{0x1affc,0x1affc},{0x1afff,0x1afff},{0x1b123,0x1b14f},{0x1b153,0x1b163}, + {0x1b168,0x1b16f},{0x1b2fc,0x1bbff},{0x1bc6b,0x1bc6f},{0x1bc7d,0x1bc7f},{0x1bc89,0x1bc8f}, + {0x1bc9a,0x1bc9b},{0x1bca0,0x1ceff},{0x1cf2e,0x1cf2f},{0x1cf47,0x1cf4f},{0x1cfc4,0x1cfff}, + {0x1d0f6,0x1d0ff},{0x1d127,0x1d128},{0x1d173,0x1d17a},{0x1d1eb,0x1d1ff},{0x1d246,0x1d2df}, + {0x1d2f4,0x1d2ff},{0x1d357,0x1d35f},{0x1d379,0x1d3ff},{0x1d455,0x1d455},{0x1d49d,0x1d49d}, + {0x1d4a0,0x1d4a1},{0x1d4a3,0x1d4a4},{0x1d4a7,0x1d4a8},{0x1d4ad,0x1d4ad},{0x1d4ba,0x1d4ba}, + {0x1d4bc,0x1d4bc},{0x1d4c4,0x1d4c4},{0x1d506,0x1d506},{0x1d50b,0x1d50c},{0x1d515,0x1d515}, + {0x1d51d,0x1d51d},{0x1d53a,0x1d53a},{0x1d53f,0x1d53f},{0x1d545,0x1d545},{0x1d547,0x1d549}, + {0x1d551,0x1d551},{0x1d6a6,0x1d6a7},{0x1d7cc,0x1d7cd},{0x1da8c,0x1da9a},{0x1daa0,0x1daa0}, + {0x1dab0,0x1deff},{0x1df1f,0x1dfff},{0x1e007,0x1e007},{0x1e019,0x1e01a},{0x1e022,0x1e022}, + {0x1e025,0x1e025},{0x1e02b,0x1e0ff},{0x1e12d,0x1e12f},{0x1e13e,0x1e13f},{0x1e14a,0x1e14d}, + {0x1e150,0x1e28f},{0x1e2af,0x1e2bf},{0x1e2fa,0x1e2fe},{0x1e300,0x1e7df},{0x1e7e7,0x1e7e7}, + {0x1e7ec,0x1e7ec},{0x1e7ef,0x1e7ef},{0x1e7ff,0x1e7ff},{0x1e8c5,0x1e8c6},{0x1e8d7,0x1e8ff}, + {0x1e94c,0x1e94f},{0x1e95a,0x1e95d},{0x1e960,0x1ec70},{0x1ecb5,0x1ed00},{0x1ed3e,0x1edff}, + 
{0x1ee04,0x1ee04},{0x1ee20,0x1ee20},{0x1ee23,0x1ee23},{0x1ee25,0x1ee26},{0x1ee28,0x1ee28}, + {0x1ee33,0x1ee33},{0x1ee38,0x1ee38},{0x1ee3a,0x1ee3a},{0x1ee3c,0x1ee41},{0x1ee43,0x1ee46}, + {0x1ee48,0x1ee48},{0x1ee4a,0x1ee4a},{0x1ee4c,0x1ee4c},{0x1ee50,0x1ee50},{0x1ee53,0x1ee53}, + {0x1ee55,0x1ee56},{0x1ee58,0x1ee58},{0x1ee5a,0x1ee5a},{0x1ee5c,0x1ee5c},{0x1ee5e,0x1ee5e}, + {0x1ee60,0x1ee60},{0x1ee63,0x1ee63},{0x1ee65,0x1ee66},{0x1ee6b,0x1ee6b},{0x1ee73,0x1ee73}, + {0x1ee78,0x1ee78},{0x1ee7d,0x1ee7d},{0x1ee7f,0x1ee7f},{0x1ee8a,0x1ee8a},{0x1ee9c,0x1eea0}, + {0x1eea4,0x1eea4},{0x1eeaa,0x1eeaa},{0x1eebc,0x1eeef},{0x1eef2,0x1efff},{0x1f02c,0x1f02f}, + {0x1f094,0x1f09f},{0x1f0af,0x1f0b0},{0x1f0c0,0x1f0c0},{0x1f0d0,0x1f0d0},{0x1f0f6,0x1f0ff}, + {0x1f1ae,0x1f1e5},{0x1f203,0x1f20f},{0x1f23c,0x1f23f},{0x1f249,0x1f24f},{0x1f252,0x1f25f}, + {0x1f266,0x1f2ff},{0x1f6d8,0x1f6dc},{0x1f6ed,0x1f6ef},{0x1f6fd,0x1f6ff},{0x1f774,0x1f77f}, + {0x1f7d9,0x1f7df},{0x1f7ec,0x1f7ef},{0x1f7f1,0x1f7ff},{0x1f80c,0x1f80f},{0x1f848,0x1f84f}, + {0x1f85a,0x1f85f},{0x1f888,0x1f88f},{0x1f8ae,0x1f8af},{0x1f8b2,0x1f8ff},{0x1fa54,0x1fa5f}, + {0x1fa6e,0x1fa6f},{0x1fa75,0x1fa77},{0x1fa7d,0x1fa7f},{0x1fa87,0x1fa8f},{0x1faad,0x1faaf}, + {0x1fabb,0x1fabf},{0x1fac6,0x1facf},{0x1fada,0x1fadf},{0x1fae8,0x1faef},{0x1faf7,0x1faff}, + {0x1fb93,0x1fb93},{0x1fbcb,0x1fbef},{0x1fbfa,0x1ffff},{0x2a6e0,0x2a6ff},{0x2b739,0x2b73f}, + {0x2b81e,0x2b81f},{0x2cea2,0x2ceaf},{0x2ebe1,0x2f7ff},{0x2fa1e,0x2ffff},{0x3134b,0xe00ff}, + {0x3134b,0xe00ff} +}; + +// This category is not official and is only used for regex purposes +static const std::vector> whitespace_ranges = { + {0x0009, 0x0009}, {0x000A, 0x000A}, {0x000B, 0x000B}, {0x000C, 0x000C}, + {0x000D, 0x000D}, {0x0020, 0x0020}, {0x85, 0x85}, {0xa0, 0xa0}, {0x1680, 0x1680}, + {0x2000, 0x200a}, {0x2028, 0x2029}, {0x202f, 0x202f}, {0x205f, 0x205f}, {0x3000, 0x3000} +}; + +static std::vector> all_ranges; + +static std::map, int> codepoint_type_map; + +static std::string 
codepoint_to_utf8(uint32_t cp) {
+    // Encodes one code point as UTF-8; the number of bytes (1-4) is chosen by
+    // the magnitude of cp. Throws std::invalid_argument when cp > 0x10ffff.
+    std::string result;
+    if (/* 0x00 <= cp && */ cp <= 0x7f) {
+        // 0xxxxxxx
+        result.push_back(cp);
+    }
+    else if (0x80 <= cp && cp <= 0x7ff) {
+        // 110xxxxx 10xxxxxx
+        result.push_back(0xc0 | ((cp >> 6) & 0x1f));
+        result.push_back(0x80 | (cp & 0x3f));
+    }
+    else if (0x800 <= cp && cp <= 0xffff) {
+        // 1110xxxx 10xxxxxx 10xxxxxx
+        result.push_back(0xe0 | ((cp >> 12) & 0x0f));
+        result.push_back(0x80 | ((cp >> 6) & 0x3f));
+        result.push_back(0x80 | (cp & 0x3f));
+    }
+    else if (0x10000 <= cp && cp <= 0x10ffff) {
+        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        result.push_back(0xf0 | ((cp >> 18) & 0x07));
+        result.push_back(0x80 | ((cp >> 12) & 0x3f));
+        result.push_back(0x80 | ((cp >> 6) & 0x3f));
+        result.push_back(0x80 | (cp & 0x3f));
+    }
+    else {
+        throw std::invalid_argument("invalid codepoint");
+    }
+    return result;
+}
+
+// Concatenates the UTF-8 encoding of every code point in cps, in order.
+// Propagates std::invalid_argument from codepoint_to_utf8.
+static std::string codepoints_to_utf8(const std::vector<uint32_t> & cps) {
+    std::string result;
+    for (size_t i = 0; i < cps.size(); ++i) {
+        result.append(codepoint_to_utf8(cps[i]));
+    }
+    return result;
+}
+
+// Decodes the UTF-8 sequence that starts at utf8[offset] and returns its code point.
+// On success offset is advanced past the 1-4 bytes that were consumed.
+// Throws std::invalid_argument when the first byte is a continuation byte, when the
+// sequence is truncated or contains a byte that is not 10xxxxxx where one is required,
+// or when the first byte has its five high bits set.
+// The caller must guarantee offset < utf8.size(); this is only asserted.
+// No check is made for overlong encodings or surrogate values.
+static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) {
+    assert(offset < utf8.size());
+    if (!(utf8[offset + 0] & 0x80)) {
+        // single byte: high bit clear, so the char value is non-negative
+        auto result = utf8[offset + 0];
+        offset += 1;
+        return result;
+    }
+    else if (!(utf8[offset + 0] & 0x40)) {
+        // 10xxxxxx cannot start a sequence
+        throw std::invalid_argument("invalid character");
+    }
+    else if (!(utf8[offset + 0] & 0x20)) {
+        if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) {
+            throw std::invalid_argument("invalid character");
+        }
+        auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
+        offset += 2;
+        return result;
+    }
+    else if (!(utf8[offset + 0] & 0x10)) {
+        if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) {
+            throw std::invalid_argument("invalid character");
+        }
+        auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
+        offset += 3;
+        return result;
+    }
+    else if (!(utf8[offset + 0] & 0x08)) {
+        if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) {
+            throw std::invalid_argument("invalid character");
+        }
+        auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
+        offset += 4;
+        return result;
+    }
+    throw std::invalid_argument("invalid string");
+}
+
+// Decodes a whole UTF-8 string into its sequence of code points.
+// Propagates std::invalid_argument from codepoint_from_utf8 on malformed input.
+static std::vector<uint32_t> codepoints_from_utf8(const std::string & utf8) {
+    std::vector<uint32_t> result;
+    size_t offset = 0;
+    while (offset < utf8.size()) {
+        result.push_back(codepoint_from_utf8(utf8, offset));
+    }
+    return result;
+}
+
+
+
+// Category codes returned by codepoint_type(); UNIDENTIFIED is returned when
+// a code point is found in none of the range tables.
+#define CODEPOINT_TYPE_UNIDENTIFIED 0
+#define CODEPOINT_TYPE_OTHER 1
+#define CODEPOINT_TYPE_NUMBER 2
+#define CODEPOINT_TYPE_LETTER 3
+#define CODEPOINT_TYPE_PUNCTUATION 4
+#define CODEPOINT_TYPE_MARK 5
+#define CODEPOINT_TYPE_SEPARATOR 6
+#define CODEPOINT_TYPE_SYMBOL 7
+
+
+// Fills codepoint_type_map: every (first, last) pair of each category table
+// becomes a key whose value is that category's CODEPOINT_TYPE_* constant.
+// A pair that occurs in more than one table keeps the value assigned last.
+// Always returns true.
+static bool codepoint_type_init_map() {
+    for (auto i : other_ranges) {
+        codepoint_type_map[i] = CODEPOINT_TYPE_OTHER;
+    }
+    for (auto i : number_ranges) {
+        codepoint_type_map[i] = CODEPOINT_TYPE_NUMBER;
+    }
+    for (auto i : letter_ranges) {
+        codepoint_type_map[i] = CODEPOINT_TYPE_LETTER;
+    }
+    for (auto i : punctuation_ranges) {
+        codepoint_type_map[i] = CODEPOINT_TYPE_PUNCTUATION;
+    }
+    for (auto i : mark_ranges) {
+        codepoint_type_map[i] = CODEPOINT_TYPE_MARK;
+    }
+    for (auto i : separator_ranges) {
+        codepoint_type_map[i] = CODEPOINT_TYPE_SEPARATOR;
+    }
+    for (auto i : symbol_ranges) {
+        codepoint_type_map[i] = CODEPOINT_TYPE_SYMBOL;
+    }
+    return true;
+}
+
+static bool codepoint_type_init_search_vector() { +
all_ranges.insert(all_ranges.end(), other_ranges.begin(), other_ranges.end());
+    all_ranges.insert(all_ranges.end(), number_ranges.begin(), number_ranges.end());
+    all_ranges.insert(all_ranges.end(), letter_ranges.begin(), letter_ranges.end());
+    all_ranges.insert(all_ranges.end(), punctuation_ranges.begin(), punctuation_ranges.end());
+    all_ranges.insert(all_ranges.end(), mark_ranges.begin(), mark_ranges.end());
+    all_ranges.insert(all_ranges.end(), separator_ranges.begin(), separator_ranges.end());
+    all_ranges.insert(all_ranges.end(), symbol_ranges.begin(), symbol_ranges.end());
+    // binary_search_implement bisects this table, so it has to be ordered
+    std::sort(all_ranges.begin(), all_ranges.end());
+    return true;
+}
+
+// Returns the index of a pair in `ranges` whose closed interval [first, second]
+// contains cp. `ranges` must be sorted in ascending order.
+// Throws std::runtime_error when no pair contains cp, including when `ranges`
+// is empty or cp lies below the first pair.
+static size_t binary_search_implement(uint32_t cp, const std::vector<std::pair<uint32_t, uint32_t>> & ranges) {
+    if (ranges.empty()) {
+        // size() - 1 would wrap around in size_t
+        throw std::runtime_error("Target out of range!");
+    }
+
+    size_t left = 0;
+    size_t right = ranges.size() - 1;
+
+    while (left <= right) {
+        size_t mid = left + (right - left) / 2;
+        const auto& range = ranges[mid];
+
+        if (cp >= range.first && cp <= range.second) {
+            // Target is within the range of the current pair.
+            return mid;
+        } else if (cp < range.first) {
+            // Target is less than the start of the range, search in the left half.
+            if (mid == 0) {
+                // nothing to the left; mid - 1 would wrap around to SIZE_MAX
+                break;
+            }
+            right = mid - 1;
+        } else {
+            // Target is greater than the end of the range, search in the right half.
+            left = mid + 1;
+        }
+    }
+    throw std::runtime_error("Target out of range!");
+}
+
+// Looks cp up in all_ranges and returns the category stored for the matching
+// pair in codepoint_type_map, or CODEPOINT_TYPE_UNIDENTIFIED when no pair matches.
+static int codepoint_type_binary_search(uint32_t cp) {
+    try {
+        auto result = binary_search_implement(cp, all_ranges);
+        return codepoint_type_map[all_ranges[result]];
+    } catch (const std::runtime_error & e) {
+        return CODEPOINT_TYPE_UNIDENTIFIED;
+    }
+}
+
+// Builds both lookup structures (the pair -> category map and the sorted
+// search vector). Returns true when both builders report success.
+static bool codepoint_type_init() {
+    bool map_initialized = codepoint_type_init_map();
+    bool sv_initialized = codepoint_type_init_search_vector();
+
+    return map_initialized && sv_initialized;
+}
+
+// Returns the CODEPOINT_TYPE_* category of cp.
+// The lookup tables are built on the first call.
+static int codepoint_type(uint32_t cp) {
+    static bool codepoint_type_initialized = codepoint_type_init();
+    (void) codepoint_type_initialized; // only its initializer matters
+    return codepoint_type_binary_search(cp);
+}
+
+// Returns the category of the LAST code point encoded in utf8, or
+// CODEPOINT_TYPE_UNIDENTIFIED for an empty string.
+// Propagates std::invalid_argument from codepoints_from_utf8 on malformed input.
+static int codepoint_type(const std::string & utf8) {
+    if (utf8.length() == 0) {
+        return CODEPOINT_TYPE_UNIDENTIFIED;
+    }
+    return codepoint_type(codepoints_from_utf8(utf8).back());
+}
+
+// Returns true when the last code point encoded in utf8 lies in whitespace_ranges;
+// false for an empty string or any other code point.
+static bool codepoint_is_whitespace(const std::string & utf8) {
+    if (utf8.length() == 0) {
+        return false;
+    }
+
+    try {
+        binary_search_implement(codepoints_from_utf8(utf8).back(), whitespace_ranges);
+        return true;
+    } catch (const std::runtime_error & e) {
+        return false;
+    }
+}
+
+// Builds the byte -> string table used by bytes_to_unicode_bpe.
+// Bytes in '!'..'~', '¡'..'¬' and '®'..'ÿ' map to the UTF-8 encoding of the
+// code point with the same value; every remaining byte, taken in ascending
+// order, maps to the UTF-8 encoding of 256, 257, 258, ...
+static std::unordered_map<uint8_t, std::string> bytes_to_unicode_map_bpe() {
+    std::unordered_map<uint8_t, std::string> map;
+    for (int ch = u'!'; ch <= u'~'; ++ch) {
+        assert(0 <= ch && ch < 256);
+        map[ch] = codepoint_to_utf8(ch);
+    }
+    for (int ch = u'¡'; ch <= u'¬'; ++ch) {
+        assert(0 <= ch && ch < 256);
+        map[ch] = codepoint_to_utf8(ch);
+    }
+    for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
+        assert(0 <= ch && ch < 256);
+        map[ch] = codepoint_to_utf8(ch);
+    }
+    auto n = 0;
+    for (int ch = 0; ch < 256; ++ch) {
+        if (map.find(ch) == map.end()) {
+            map[ch] = codepoint_to_utf8(256 + n);
+            ++n;
+        }
+    }
+    return map;
+}
+
+// Returns the string assigned to `byte` by bytes_to_unicode_map_bpe.
+// The table is built once, on the first call.
+static std::string bytes_to_unicode_bpe(uint8_t byte) {
+    static std::unordered_map<uint8_t, std::string> map = bytes_to_unicode_map_bpe();
+    return map.at(byte);
+}
+
+static std::unordered_map<std::string, uint8_t> unicode_to_bytes_map_bpe() { +
std::unordered_map<std::string, uint8_t> map;
+    // Reverse direction of bytes_to_unicode_map_bpe: the same three byte runs
+    // are keyed by the UTF-8 encoding of their own value, and every other byte
+    // is keyed by the UTF-8 encoding of 256, 257, ... in ascending byte order.
+    for (int ch = u'!'; ch <= u'~'; ++ch) {
+        assert(0 <= ch && ch < 256);
+        map[codepoint_to_utf8(ch)] = ch;
+    }
+    for (int ch = u'¡'; ch <= u'¬'; ++ch) {
+        assert(0 <= ch && ch < 256);
+        map[codepoint_to_utf8(ch)] = ch;
+    }
+    for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
+        assert(0 <= ch && ch < 256);
+        map[codepoint_to_utf8(ch)] = ch;
+    }
+    auto n = 0;
+    for (int ch = 0; ch < 256; ++ch) {
+        if (map.find(codepoint_to_utf8(ch)) == map.end()) {
+            map[codepoint_to_utf8(256 + n)] = ch;
+            ++n;
+        }
+    }
+    return map;
+}
+
+// Returns the byte assigned to `utf8` by unicode_to_bytes_map_bpe.
+// The table is built once, on the first call; map.at() throws
+// std::out_of_range for a string that is not a key.
+static uint8_t unicode_to_bytes_bpe(const std::string & utf8) {
+    static std::unordered_map<std::string, uint8_t> map = unicode_to_bytes_map_bpe();
+    return map.at(utf8);
+}
+
+
diff --git a/whisper.cpp b/whisper.cpp
index 28e3804f68f..4df8e9e577c 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -1,4 +1,5 @@
 #include "whisper.h"
+#include "unicode.h"
 
 #ifdef WHISPER_USE_COREML
 #include "coreml/whisper-encoder.h"
@@ -2928,6 +2929,167 @@ static bool log_mel_spectrogram(
     return true;
 }
 
+// Algorithm for byte pair encoding
+// BPE essentially first breaks down a string into bytes
+// and then merges adjacent bytes together according to rules
+// until they can no longer be merged
+// This algo is for educational purposes and not optimized for high performance
+//
+// Returns the token ids for `word`. Each byte of `word` is first looked up in
+// vocab.token_to_id with .at(), which throws when a byte has no entry.
+// Then, repeatedly, the adjacent pair whose concatenated text is itself a
+// vocabulary token with the smallest id is replaced by that token, until no
+// adjacent pair forms a vocabulary token. An empty word yields an empty result.
+static std::vector<whisper_vocab::id> bpe_encode(const whisper_vocab & vocab, const std::string & word) {
+    std::vector<whisper_vocab::id> tokens;
+    tokens.reserve(word.size());
+
+    // split each word into an array of single byte
+    for (size_t pos = 0; pos < word.size(); pos++) {
+        tokens.push_back(vocab.token_to_id.at(word.substr(pos, 1)));
+    }
+
+    while (true) {
+        int min_idx = -1;
+        int min_rank = -1;
+
+        // iterate over all pairs and find the pair we want to merge the most
+        // ("pos + 1 < size()" cannot wrap around when tokens is empty, unlike "pos < size() - 1")
+        for (size_t pos = 0; pos + 1 < tokens.size(); pos++) {
+            auto query = vocab.id_to_token.at(tokens[pos]) + vocab.id_to_token.at(tokens[pos+1]);
+            auto it = vocab.token_to_id.find(query);
+            if (it != vocab.token_to_id.end()) {
+                auto rank = it->second;
+                // find the pair with the lowest rank
+                if (min_rank == -1 || rank < min_rank) {
+                    min_idx = static_cast<int>(pos);
+                    min_rank = rank;
+                }
+            }
+        }
+
+        // if there were no pairs we could merge, we're done!
+        if (min_rank == -1) {
+            break;
+        }
+
+        // update token vector
+        tokens[min_idx] = min_rank;
+        tokens.erase(tokens.begin() + min_idx + 1);
+    }
+
+    return tokens;
+}
+
+// Splits `text` into the pieces that are later BPE-encoded one by one.
+// This is a hand-written matcher for the pattern below; the alternatives are
+// tried in the order they are written, at each position of the text:
+//   - the contractions 's 't 're 've 'm 'll 'd
+//   - an optional single space followed by a run of letters
+//   - an optional single space followed by a run of numbers
+//   - an optional single space followed by a run of code points that are
+//     neither whitespace, letters nor numbers
+//   - a run of whitespace; when a non-whitespace code point follows, the last
+//     whitespace code point is left for the next piece
+// Propagates std::invalid_argument from codepoints_from_utf8 on malformed UTF-8.
+// 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
+static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
+    std::vector<std::string> bpe_words;
+    std::vector<std::string> text_utf8;
+
+    // one string per code point, so that text_utf8[i] is the i-th character
+    auto cps = codepoints_from_utf8(text);
+    text_utf8.reserve(cps.size());
+
+    for (auto cp : cps) {
+        text_utf8.emplace_back(codepoint_to_utf8(cp));
+    }
+
+    size_t i = 0;
+    while (i < text_utf8.size()) {
+        const std::string & utf8_char = text_utf8[i];
+        // forward backward lookups ("" when past the end of the text)
+        const std::string & utf8_char_next = (i + 1 < text_utf8.size()) ? text_utf8[i + 1] : "";
+        const std::string & utf8_char_next_next = (i + 2 < text_utf8.size()) ? text_utf8[i + 2] : "";
+
+        bool collect_letter = false;
+        bool collect_number = false;
+        bool collect_special = false;
+        bool collect_whitespace = false;
+
+        //'s|'t|'re|'ve|'m|'ll|'d
+        if (utf8_char == "'") {
+            if (utf8_char_next == "s" || utf8_char_next == "t" || utf8_char_next == "m" || utf8_char_next == "d") {
+                bpe_words.push_back(utf8_char+utf8_char_next);
+                i += 2;
+                continue;
+            } else if (((utf8_char_next == "r" || utf8_char_next == "v") && utf8_char_next_next == "e") || (utf8_char_next == "l" && utf8_char_next_next == "l")) {
+                bpe_words.push_back(utf8_char+utf8_char_next+utf8_char_next_next);
+                i += 3;
+                continue;
+            }
+        }
+
+        auto codepoint_type_utf8_char = codepoint_type(utf8_char);
+        auto codepoint_type_utf8_char_next = codepoint_type(utf8_char_next);
+        auto utf8_char_is_whitespace = codepoint_is_whitespace(utf8_char);
+        auto utf8_char_next_is_whitespace = codepoint_is_whitespace(utf8_char_next);
+        std::string word;
+        std::string buffer;
+        std::string buffer_next;
+
+        // decide which alternative starts at position i
+        if (codepoint_type_utf8_char == CODEPOINT_TYPE_LETTER || (utf8_char == " " && codepoint_type_utf8_char_next == CODEPOINT_TYPE_LETTER)) { // ?\p{L}+
+            collect_letter = true;
+        } else if (codepoint_type_utf8_char == CODEPOINT_TYPE_NUMBER || (utf8_char == " " && codepoint_type_utf8_char_next == CODEPOINT_TYPE_NUMBER)) { // ?\p{N}+
+            collect_number = true;
+        } else if ((!utf8_char_is_whitespace && codepoint_type_utf8_char != CODEPOINT_TYPE_LETTER && codepoint_type_utf8_char != CODEPOINT_TYPE_NUMBER) || (utf8_char == " " && !utf8_char_next_is_whitespace && codepoint_type_utf8_char_next != CODEPOINT_TYPE_LETTER && codepoint_type_utf8_char_next != CODEPOINT_TYPE_NUMBER)) { // ?[^\s\p{L}\p{N}]+
+            collect_special = true;
+        } else if (utf8_char_is_whitespace) { //\s+(?!\S)|\s+
+            collect_whitespace = true;
+        }
+
+        // the first character (possibly the optional leading space) is always consumed
+        if (collect_letter || collect_number || collect_special || collect_whitespace) {
+            word += utf8_char;
+            i++;
+        }
+
+        if (collect_letter) {
+            while (i < text_utf8.size()) {
+                // load the candidate first; without this, buffer stays empty,
+                // is never a letter, and every letter ends up as its own word
+                buffer = text_utf8[i];
+                if (codepoint_type(buffer) == CODEPOINT_TYPE_LETTER) {
+                    word += buffer;
+                    i++;
+                } else {
+                    break;
+                }
+            }
+        } else if (collect_number) {
+            while (i < text_utf8.size()) {
+                buffer = text_utf8[i];
+                if (codepoint_type(buffer) == CODEPOINT_TYPE_NUMBER) {
+                    word += buffer;
+                    i++;
+                } else {
+                    break;
+                }
+            }
+        } else if (collect_special) {
+            while (i < text_utf8.size()) {
+                buffer = text_utf8[i];
+                auto codepoint_type_buffer = codepoint_type(buffer);
+                auto buffer_is_whitespace = codepoint_is_whitespace(buffer);
+                if (!buffer_is_whitespace && codepoint_type_buffer != CODEPOINT_TYPE_LETTER && codepoint_type_buffer != CODEPOINT_TYPE_NUMBER) {
+                    word += buffer;
+                    i++;
+                } else {
+                    break;
+                }
+            }
+        } else if (collect_whitespace) {
+            while (i < text_utf8.size()) {
+                buffer = text_utf8[i];
+                buffer_next = (i + 1 < text_utf8.size()) ? text_utf8[i + 1] : "";
+                auto buffer_is_whitespace = codepoint_is_whitespace(buffer);
+                auto buffer_next_is_whitespace = codepoint_is_whitespace(buffer_next);
+                // take a whitespace character only if more whitespace follows it or it
+                // is the last character; the one right before a word is left behind
+                if ((!buffer_next.empty() && buffer_is_whitespace && buffer_next_is_whitespace) || (buffer_next.empty() && buffer_is_whitespace)) {
+                    word += buffer;
+                    i++;
+                } else {
+                    break;
+                }
+            }
+        }
+
+        if (!word.empty()) {
+            bpe_words.push_back(word);
+        }
+    }
+
+    return bpe_words;
+}
+
+
 // split text into tokens
 //
 // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
@@ -2936,53 +3098,23 @@ static bool log_mel_spectrogram(
 // r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
 //
 // Regex (C++):
-// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
+// R"('s|'t|'re|'ve|'m|'ll|'d| ?[a-zA-Z]+| ?[0-9]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+)"
+// this remains ineffective in C++ as std::regex does not provide support for Unicode properties
+// ref: https://stackoverflow.com/a/38002322
+// so we chose to implement our own regex algorithm to solve this problem
+//
bpe_gpt2_preprocess // static std::vector tokenize(const whisper_vocab & vocab, const std::string & text) { - std::vector words; // first split the text into words - { - std::string str = text; - std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"; - - std::regex re(pat); - std::smatch m; + // this enables parallel processing + auto words = bpe_gpt2_preprocess(" " + text); - while (std::regex_search(str, m, re)) { - for (auto x : m) { - words.push_back(x); - } - str = m.suffix(); - } - } - - // find the longest tokens that form the words: std::vector tokens; + for (const auto & word : words) { - if (word.empty()) continue; - - int i = 0; - int n = word.size(); - while (i < n) { - int j = n; - bool found = false; - while (j > i) { - auto sub = word.substr(i, j-i); - auto it = vocab.token_to_id.find(sub); - if (it != vocab.token_to_id.end()) { - tokens.push_back(it->second); - i = j; - found = true; - break; - } - --j; - } - if (!found) { - WHISPER_LOG_ERROR("unknown token\n"); - ++i; - } - } + auto word_tokens = bpe_encode(vocab, word); + tokens.insert(tokens.end(), word_tokens.begin(), word_tokens.end()); } return tokens;