diff --git a/UPGRADING b/UPGRADING index 54a2f0e8c33e2..990e4a93cdfc8 100644 --- a/UPGRADING +++ b/UPGRADING @@ -55,6 +55,16 @@ PHP 8.4 UPGRADE NOTES 5. Changed Functions ======================================== +- MBString: + . After performance optimization, mb_strcut sometimes returns slightly different + output for invalid UTF-8 strings. The previous implementation would sometimes + remove invalid UTF-8 bytes just after the cut points, but in other cases, it + would pass them through to the output unchanged. The new implementation + always backs up to the preceding non-continuation byte (a byte which starts + a UTF-8 character) before the starting and ending cut points, if the byte + immediately after the cut point is a continuation byte. (For valid UTF-8 + strings, the output of mb_strcut is unchanged.) + - Standard: . The internal implementation for rounding to integers has been rewritten to be easier to verify for correctness and to be easier to maintain. diff --git a/ext/mbstring/libmbfl/filters/mbfilter_7bit.c b/ext/mbstring/libmbfl/filters/mbfilter_7bit.c index 54744aa4b8ed7..b5ccfb5ab7a7c 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_7bit.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_7bit.c @@ -65,6 +65,7 @@ const mbfl_encoding mbfl_encoding_7bit = { &vtbl_wchar_7bit, mb_7bit_to_wchar, mb_wchar_to_7bit, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/filters/mbfilter_base64.c b/ext/mbstring/libmbfl/filters/mbfilter_base64.c index b5a732224f003..ebabeeca52848 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_base64.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_base64.c @@ -45,6 +45,7 @@ const mbfl_encoding mbfl_encoding_base64 = { NULL, mb_base64_to_wchar, mb_wchar_to_base64, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cjk.c b/ext/mbstring/libmbfl/filters/mbfilter_cjk.c index 13635764326f3..b0b771eb4c520 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cjk.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cjk.c @@ -4392,7 +4392,8 @@ const mbfl_encoding mbfl_encoding_jis = { &vtbl_wchar_jis, mb_iso2022jp_to_wchar, mb_wchar_to_jis, - mb_check_jis + mb_check_jis, + NULL }; static const struct mbfl_convert_vtbl vtbl_2022jp_wchar = { @@ -4426,7 +4427,8 @@ const mbfl_encoding mbfl_encoding_2022jp = { &vtbl_wchar_2022jp, mb_iso2022jp_to_wchar, mb_wchar_to_iso2022jp, - mb_check_iso2022jp + mb_check_iso2022jp, + NULL }; static const char *mbfl_encoding_2022jp_kddi_aliases[] = {"ISO-2022-JP-KDDI", NULL}; @@ -4462,6 +4464,7 @@ const mbfl_encoding mbfl_encoding_2022jp_kddi = { &vtbl_wchar_2022jp_kddi, mb_iso2022jp_kddi_to_wchar, mb_wchar_to_iso2022jp_kddi, + NULL, NULL }; @@ -4496,6 +4499,7 @@ const mbfl_encoding mbfl_encoding_2022jp_2004 = { &vtbl_wchar_2022jp_2004, mb_iso2022jp2004_to_wchar, mb_wchar_to_iso2022jp2004, + NULL, NULL }; @@ -4581,6 +4585,7 @@ const mbfl_encoding mbfl_encoding_cp50220 = { &vtbl_wchar_cp50220, mb_cp5022x_to_wchar, mb_wchar_to_cp50220, + NULL, NULL }; @@ -4595,6 +4600,7 @@ const mbfl_encoding mbfl_encoding_cp50221 = { &vtbl_wchar_cp50221, mb_cp5022x_to_wchar, mb_wchar_to_cp50221, + NULL, NULL }; @@ -4609,6 +4615,7 @@ const mbfl_encoding mbfl_encoding_cp50222 = { &vtbl_wchar_cp50222, mb_cp5022x_to_wchar, mb_wchar_to_cp50222, + NULL, NULL }; @@ -4645,6 +4652,7 @@ const mbfl_encoding mbfl_encoding_2022jpms = { &vtbl_wchar_2022jpms, mb_iso2022jpms_to_wchar, mb_wchar_to_iso2022jpms, + NULL, NULL }; @@ -4687,6 +4695,7 @@ const mbfl_encoding mbfl_encoding_2022kr = { &vtbl_wchar_2022kr, mb_iso2022kr_to_wchar, mb_wchar_to_iso2022kr, + NULL, NULL }; @@ -7832,6 +7841,7 @@ const mbfl_encoding mbfl_encoding_sjis = { &vtbl_wchar_sjis, mb_sjis_to_wchar, mb_wchar_to_sjis, + NULL, NULL }; @@ -7868,6 +7878,7 @@ const mbfl_encoding mbfl_encoding_sjis_mac = { &vtbl_wchar_sjis_mac, mb_sjismac_to_wchar, mb_wchar_to_sjismac, + NULL, NULL }; @@ -7906,6 +7917,7 @@ const mbfl_encoding mbfl_encoding_sjis_docomo = { &vtbl_wchar_sjis_docomo, mb_sjis_docomo_to_wchar, mb_wchar_to_sjis_docomo, + NULL, NULL }; @@ -7940,6 +7952,7 @@ const mbfl_encoding mbfl_encoding_sjis_kddi = { &vtbl_wchar_sjis_kddi, mb_sjis_kddi_to_wchar, mb_wchar_to_sjis_kddi, + NULL, NULL }; @@ -7974,6 +7987,7 @@ const mbfl_encoding mbfl_encoding_sjis_sb = { &vtbl_wchar_sjis_sb, mb_sjis_sb_to_wchar, mb_wchar_to_sjis_sb, + NULL, NULL }; @@ -8017,6 +8031,7 @@ const mbfl_encoding mbfl_encoding_sjis2004 = { &vtbl_wchar_sjis2004, mb_sjis2004_to_wchar, mb_wchar_to_sjis2004, + NULL, NULL }; @@ -8103,6 +8118,7 @@ const mbfl_encoding mbfl_encoding_cp932 = { &vtbl_wchar_cp932, mb_cp932_to_wchar, mb_wchar_to_cp932, + NULL, NULL }; @@ -8137,6 +8153,7 @@ const mbfl_encoding mbfl_encoding_sjiswin = { &vtbl_wchar_sjiswin, mb_cp932_to_wchar, mb_wchar_to_sjiswin, + NULL, NULL }; @@ -10346,6 +10363,7 @@ const mbfl_encoding mbfl_encoding_euc_jp = { &vtbl_wchar_eucjp, mb_eucjp_to_wchar, mb_wchar_to_eucjp, + NULL, NULL }; @@ -10382,6 +10400,7 @@ const mbfl_encoding mbfl_encoding_eucjp2004 = { &vtbl_wchar_eucjp2004, mb_eucjp2004_to_wchar, mb_wchar_to_eucjp2004, + NULL, NULL }; @@ -10418,6 +10437,7 @@ const mbfl_encoding mbfl_encoding_eucjp_win = { &vtbl_wchar_eucjpwin, mb_eucjpwin_to_wchar, mb_wchar_to_eucjpwin, + NULL, NULL }; @@ -10454,6 +10474,7 @@ const mbfl_encoding mbfl_encoding_cp51932 = { &vtbl_wchar_cp51932, mb_cp51932_to_wchar, mb_wchar_to_cp51932, + NULL, NULL }; @@ -10509,6 +10530,7 @@ const mbfl_encoding mbfl_encoding_euc_cn = { &vtbl_wchar_euccn, mb_euccn_to_wchar, mb_wchar_to_euccn, + NULL, NULL }; @@ -10545,6 +10567,7 @@ const mbfl_encoding mbfl_encoding_euc_tw = { &vtbl_wchar_euctw, mb_euctw_to_wchar, mb_wchar_to_euctw, + NULL, NULL }; @@ -10581,6 +10604,7 @@ const mbfl_encoding mbfl_encoding_euc_kr = { &vtbl_wchar_euckr, mb_euckr_to_wchar, mb_wchar_to_euckr, + NULL, NULL }; @@ -10640,6 +10664,7 @@ const mbfl_encoding mbfl_encoding_uhc = { &vtbl_wchar_uhc, mb_uhc_to_wchar, mb_wchar_to_uhc, + NULL, NULL }; @@ -11555,6 +11580,7 @@ const mbfl_encoding mbfl_encoding_gb18030 = { &vtbl_wchar_gb18030, mb_gb18030_to_wchar, mb_wchar_to_gb18030, + NULL, NULL }; @@ -11591,6 +11617,7 @@ const mbfl_encoding mbfl_encoding_cp936 = { &vtbl_wchar_cp936, mb_cp936_to_wchar, mb_wchar_to_cp936, + NULL, NULL }; @@ -12160,6 +12187,7 @@ const mbfl_encoding mbfl_encoding_big5 = { &vtbl_wchar_big5, mb_big5_to_wchar, mb_wchar_to_big5, + NULL, NULL }; @@ -12194,6 +12222,7 @@ const mbfl_encoding mbfl_encoding_cp950 = { &vtbl_wchar_cp950, mb_cp950_to_wchar, mb_wchar_to_cp950, + NULL, NULL }; @@ -12567,5 +12596,6 @@ const mbfl_encoding mbfl_encoding_hz = { &vtbl_wchar_hz, mb_hz_to_wchar, mb_wchar_to_hz, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c b/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c index a75a9c757cb83..e555736c058b6 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c @@ -67,6 +67,7 @@ const mbfl_encoding mbfl_encoding_html_ent = { &vtbl_wchar_html, mb_htmlent_to_wchar, mb_wchar_to_htmlent, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/filters/mbfilter_qprint.c b/ext/mbstring/libmbfl/filters/mbfilter_qprint.c index c743942d0c5c4..be527052a2245 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_qprint.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_qprint.c @@ -46,6 +46,7 @@ const mbfl_encoding mbfl_encoding_qprint = { NULL, mb_qprint_to_wchar, mb_wchar_to_qprint, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c b/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c index c5872335a8526..7ced00fa536e1 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c @@ -87,6 +87,7 @@ static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int &vtbl_wchar_##id, \ mb_##id##_to_wchar, \ mb_wchar_to_##id, \ + NULL, \ NULL \ } diff --git a/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c b/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c index e6711d82f8a70..2b50f46a41cf5 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c @@ -57,6 +57,7 @@ const mbfl_encoding mbfl_encoding_ucs2 = { &vtbl_wchar_ucs2, mb_ucs2_to_wchar, mb_wchar_to_ucs2be, + NULL, NULL }; @@ -71,6 +72,7 @@ const mbfl_encoding mbfl_encoding_ucs2be = { &vtbl_wchar_ucs2be, mb_ucs2be_to_wchar, mb_wchar_to_ucs2be, + NULL, NULL }; @@ -85,6 +87,7 @@ const mbfl_encoding mbfl_encoding_ucs2le = { &vtbl_wchar_ucs2le, mb_ucs2le_to_wchar, mb_wchar_to_ucs2le, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c b/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c index 1585cb82e3ff9..6b5ed6b609b1e 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c @@ -57,6 +57,7 @@ const mbfl_encoding mbfl_encoding_ucs4 = { &vtbl_wchar_ucs4, mb_ucs4_to_wchar, mb_wchar_to_ucs4be, + NULL, NULL }; @@ -71,6 +72,7 @@ const mbfl_encoding mbfl_encoding_ucs4be = { &vtbl_wchar_ucs4be, mb_ucs4be_to_wchar, mb_wchar_to_ucs4be, + NULL, NULL }; @@ -85,6 +87,7 @@ const mbfl_encoding mbfl_encoding_ucs4le = { &vtbl_wchar_ucs4le, mb_ucs4le_to_wchar, mb_wchar_to_ucs4le, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c index 6e687c941c256..cadb9c15b0669 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c @@ -189,6 +189,7 @@ const mbfl_encoding mbfl_encoding_utf16 = { &vtbl_wchar_utf16, mb_utf16_to_wchar, mb_wchar_to_utf16be, + NULL, NULL }; @@ -203,6 +204,7 @@ const mbfl_encoding mbfl_encoding_utf16be = { &vtbl_wchar_utf16be, mb_utf16be_to_wchar, mb_wchar_to_utf16be, + NULL, NULL }; @@ -217,6 +219,7 @@ const mbfl_encoding mbfl_encoding_utf16le = { &vtbl_wchar_utf16le, mb_utf16le_to_wchar, mb_wchar_to_utf16le, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c index b49f5df5369e4..e7a309171860a 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c @@ -50,6 +50,7 @@ const mbfl_encoding mbfl_encoding_utf32 = { &vtbl_wchar_utf32, mb_utf32_to_wchar, mb_wchar_to_utf32be, + NULL, NULL }; @@ -64,6 +65,7 @@ const mbfl_encoding mbfl_encoding_utf32be = { &vtbl_wchar_utf32be, mb_utf32be_to_wchar, mb_wchar_to_utf32be, + NULL, NULL }; @@ -78,6 +80,7 @@ const mbfl_encoding mbfl_encoding_utf32le = { &vtbl_wchar_utf32le, mb_utf32le_to_wchar, mb_wchar_to_utf32le, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf7.c b/ext/mbstring/libmbfl/filters/mbfilter_utf7.c index af84602ae1880..750d882853361 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf7.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf7.c @@ -62,7 +62,8 @@ const mbfl_encoding mbfl_encoding_utf7 = { &vtbl_wchar_utf7, mb_utf7_to_wchar, mb_wchar_to_utf7, - mb_check_utf7 + mb_check_utf7, + NULL }; const struct mbfl_convert_vtbl vtbl_utf7_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c b/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c index d8af71686a1f3..176f8c578ec2f 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c @@ -98,7 +98,8 @@ const mbfl_encoding mbfl_encoding_utf7imap = { &vtbl_wchar_utf7imap, mb_utf7imap_to_wchar, mb_wchar_to_utf7imap, - mb_check_utf7imap + mb_check_utf7imap, + NULL }; const struct mbfl_convert_vtbl vtbl_utf7imap_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c index 92d7c38930981..d079c2a73b62d 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c @@ -51,6 +51,7 @@ const unsigned char mblen_table_utf8[] = { static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); static void mb_wchar_to_utf8(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); +static zend_string* mb_cut_utf8(unsigned char *str, size_t from, size_t len, unsigned char *end); static const char *mbfl_encoding_utf8_aliases[] = {"utf8", NULL}; @@ -65,7 +66,8 @@ const mbfl_encoding mbfl_encoding_utf8 = { &vtbl_wchar_utf8, mb_utf8_to_wchar, mb_wchar_to_utf8, - NULL + NULL, + mb_cut_utf8 }; const struct mbfl_convert_vtbl vtbl_utf8_wchar = { @@ -335,3 +337,21 @@ static void mb_wchar_to_utf8(uint32_t *in, size_t len, mb_convert_buf *buf, bool MB_CONVERT_BUF_STORE(buf, out, limit); } + +static zend_string* mb_cut_utf8(unsigned char *str, size_t from, size_t len, unsigned char *end) +{ + unsigned char *start = str + from; + /* Byte values less than -64 are UTF-8 continuation bytes, that is, + * the 2nd, 3rd, or 4th byte of a multi-byte character */ + while (start > str && ((signed char)*start) < -64) { + start--; + } + unsigned char *_end = start + len; + if (_end >= end) { + return zend_string_init_fast((char*)start, end - start); + } + while (_end > start && ((signed char)*_end) < -64) { + _end--; + } + return zend_string_init_fast((char*)start, _end - start); +} diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c b/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c index dd253cfe689fc..44cf859db18e1 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c @@ -124,6 +124,7 @@ const mbfl_encoding mbfl_encoding_utf8_docomo = { &vtbl_wchar_utf8_docomo, mb_utf8_docomo_to_wchar, mb_wchar_to_utf8_docomo, + NULL, NULL }; @@ -138,6 +139,7 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_a = { &vtbl_wchar_utf8_kddi_a, mb_utf8_kddi_a_to_wchar, mb_wchar_to_utf8_kddi_a, + NULL, NULL }; @@ -152,6 +154,7 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_b = { &vtbl_wchar_utf8_kddi_b, mb_utf8_kddi_b_to_wchar, mb_wchar_to_utf8_kddi_b, + NULL, NULL }; @@ -166,6 +169,7 @@ const mbfl_encoding mbfl_encoding_utf8_sb = { &vtbl_wchar_utf8_sb, mb_utf8_sb_to_wchar, mb_wchar_to_utf8_sb, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c b/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c index 83a56977d3e0e..68f3862b49867 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c @@ -44,6 +44,7 @@ const mbfl_encoding mbfl_encoding_uuencode = { NULL, mb_uuencode_to_wchar, mb_wchar_to_uuencode, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c b/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c index 43db2f7f5b20b..2b933f2dea52c 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c @@ -52,6 +52,7 @@ const mbfl_encoding mbfl_encoding_8bit = { &vtbl_wchar_8bit, mb_8bit_to_wchar, mb_wchar_to_8bit, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter_pass.c b/ext/mbstring/libmbfl/mbfl/mbfilter_pass.c index b932603e1c5f4..2077c8713271f 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter_pass.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter_pass.c @@ -45,6 +45,7 @@ const mbfl_encoding mbfl_encoding_pass = { NULL, NULL, NULL, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c b/ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c index 93a8d91e7a552..6ea1676290fda 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c @@ -43,5 +43,6 @@ const mbfl_encoding mbfl_encoding_wchar = { NULL, NULL, NULL, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h index b25ec71eef9de..eee913c600cdd 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h +++ b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h @@ -145,6 +145,7 @@ typedef struct { typedef size_t (*mb_to_wchar_fn)(unsigned char **in, size_t *in_len, uint32_t *out, size_t out_len, unsigned int *state); typedef void (*mb_from_wchar_fn)(uint32_t *in, size_t in_len, mb_convert_buf *out, bool end); typedef bool (*mb_check_fn)(unsigned char *in, size_t in_len); +typedef zend_string* (*mb_cut_fn)(unsigned char *str, size_t from, size_t len, unsigned char *end); /* When converting encoded text to a buffer of wchars (Unicode codepoints) using `mb_to_wchar_fn`, * the buffer must be at least this size (to work with all supported text encodings) */ @@ -251,6 +252,7 @@ typedef struct { mb_to_wchar_fn to_wchar; mb_from_wchar_fn from_wchar; mb_check_fn check; + mb_cut_fn cut; } mbfl_encoding; extern const mbfl_encoding mbfl_encoding_utf8; diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 3e59806b86750..d19325459ea63 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -2403,19 +2403,20 @@ PHP_FUNCTION(mb_strcut) Z_PARAM_STR_OR_NULL(encoding) ZEND_PARSE_PARAMETERS_END(); - string.val = (unsigned char*)string_val; - string.encoding = php_mb_get_encoding(encoding, 4); - if (!string.encoding) { + const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4); + if (!enc) { RETURN_THROWS(); } + string.val = (unsigned char*)string_val; + string.encoding = enc; + if (len_is_null) { len = string.len; } /* if "from" position is negative, count start position from the end - * of the string - */ + * of the string */ if (from < 0) { from = string.len + from; if (from < 0) { @@ -2424,8 +2425,7 @@ PHP_FUNCTION(mb_strcut) } /* if "length" position is negative, set it to the length - * needed to stop that many chars from the end of the string - */ + * needed to stop that many chars from the end of the string */ if (len < 0) { len = (string.len - from) + len; if (len < 0) { @@ -2437,12 +2437,14 @@ PHP_FUNCTION(mb_strcut) RETURN_EMPTY_STRING(); } - ret = mbfl_strcut(&string, &result, from, len); - ZEND_ASSERT(ret != NULL); - - // TODO: avoid reallocation ??? - RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */ - efree(ret->val); + if (enc->cut) { + RETURN_STR(enc->cut(string.val, from, len, string.val + string.len)); + } else { + ret = mbfl_strcut(&string, &result, from, len); + ZEND_ASSERT(ret != NULL); + RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */ + efree(ret->val); + } } /* }}} */ diff --git a/ext/mbstring/tests/bug49354.phpt b/ext/mbstring/tests/bug49354.phpt index 5c5af8b3fd193..710eff639b088 100644 --- a/ext/mbstring/tests/bug49354.phpt +++ b/ext/mbstring/tests/bug49354.phpt @@ -4,11 +4,11 @@ Bug #49354 (mb_strcut() cuts wrong length when offset is in the middle of a mult mbstring --FILE-- 1b2442 3441 3b7a 1b2842 20 61 62 63 20 1b2442 252b 254a 1b2842] +$jis = mb_convert_encoding("漢字 abc カナ", 'JIS', 'UTF-8'); +// For testing ISO-2022-JP-2004, add a Kanji character which is in JISX 0213 +$iso2022jp2004 = mb_convert_encoding("漢字 abc カナ凜", 'ISO-2022-JP-2004', 'UTF-8'); // [1b242851 3441 3b7a 1b2842 20 61 62 63 20 1b242851 252b 254a 7425 1b2842] +$iso2022jpms = mb_convert_encoding("漢字 abc カナ", 'ISO-2022-JP-MS', 'UTF-8'); // [1b2442 3441 3b7a 1b2842 20 61 62 63 20 1b2442 252b 254a 1b2842] +$iso2022jp_kddi = mb_convert_encoding("漢字 abc カナ", 'ISO-2022-JP-KDDI', 'UTF-8'); print "== EUC-JP ==\n"; -print MBStringChars(mb_strcut($euc_jp, 6, 5,'EUC-JP'), 'EUC-JP') . "\n"; -print MBStringChars(mb_strcut($euc_jp, 5, 5,'EUC-JP'), 'EUC-JP') . "\n"; -print MBStringChars(mb_strcut($euc_jp, 0, 100,'EUC-JP'), 'EUC-JP') . "\n"; +print MBStringChars(mb_strcut($euc_jp, 6, 5, 'EUC-JP'), 'EUC-JP') . "\n"; +print MBStringChars(mb_strcut($euc_jp, 5, 5, 'EUC-JP'), 'EUC-JP') . "\n"; +print MBStringChars(mb_strcut($euc_jp, 0, 100, 'EUC-JP'), 'EUC-JP') . "\n"; -$str = mb_strcut($euc_jp, 100, 10,'EUC-JP'); +$str = mb_strcut($euc_jp, 100, 10, 'EUC-JP'); ($str === "") ? print "OK\n" : print "No good\n"; -$str = mb_strcut($euc_jp, -100, 10,'EUC-JP'); +$str = mb_strcut($euc_jp, -100, 10, 'EUC-JP'); ($str !== "") ? print "OK\n" : print "No good\n"; print "== UTF-8 ==\n"; @@ -45,6 +49,17 @@ print MBStringChars(mb_strcut($utf8, 1, 2, 'UTF-8'), 'UTF-8') . "\n"; print MBStringChars(mb_strcut($utf8, 1, 3, 'UTF-8'), 'UTF-8') . "\n"; print MBStringChars(mb_strcut($utf8, 1, 4, 'UTF-8'), 'UTF-8') . "\n"; +print MBStringChars(mb_strcut('AåBäCöDü', 2, 100, 'UTF-8'), 'UTF-8') . "\n"; + +print "== UTF-16 ==\n"; +print "Single byte: [" . bin2hex(mb_strcut("\xFF", 0, 100, 'UTF-16')) . "]\n"; +print "With from=1: [" . bin2hex(mb_strcut("\xff\x01", 1, 100, "UTF-16")) . "]\n"; +print "Bad surrogate: [" . bin2hex(mb_strcut("\xD9\xFF", 0, 100, "UTF-16")) . "]\n"; +print "Bad surrogate followed by other bytes: [" . bin2hex(mb_strcut("\xd9\x00\x12C", 0, 100, "UTF-16")) . "]\n"; +print "BE byte order mark: [" . bin2hex(mb_strcut("\xFE\xFF", 0, 100, "UTF-16")) . "]\n"; +print "LE byte order mark: [" . bin2hex(mb_strcut("\xFF\xFE", 0, 100, "UTF-16")) . "]\n"; +print "Length=0: [" . bin2hex(mb_strcut("\x00\x01\x00\x00", 1, -512, "UTF-16")) . "]\n"; + print "== UTF-16LE ==\n"; print MBStringChars(mb_strcut($utf16le, 0, 0, 'UTF-16LE'), 'UTF-16LE') . "\n"; print MBStringChars(mb_strcut($utf16le, 0, 1, 'UTF-16LE'), 'UTF-16LE') . "\n"; @@ -54,6 +69,162 @@ print MBStringChars(mb_strcut($utf16le, 1, 2, 'UTF-16LE'), 'UTF-16LE') . "\n"; print MBStringChars(mb_strcut($utf16le, 1, 3, 'UTF-16LE'), 'UTF-16LE') . "\n"; print MBStringChars(mb_strcut($utf16le, 1, 4, 'UTF-16LE'), 'UTF-16LE') . "\n"; +print "Single byte: [" . bin2hex(mb_strcut("\xFF", 0, 100, 'UTF-16LE')) . "]\n"; + +print "== UTF-32BE ==\n"; +print MBStringChars(mb_strcut($utf32be, 0, 3, 'UTF-32BE'), 'UTF-32BE') . "\n"; +print MBStringChars(mb_strcut($utf32be, 0, 4, 'UTF-32BE'), 'UTF-32BE') . "\n"; +print MBStringChars(mb_strcut($utf32be, 0, 5, 'UTF-32BE'), 'UTF-32BE') . "\n"; +print MBStringChars(mb_strcut($utf32be, 1, 8, 'UTF-32BE'), 'UTF-32BE') . "\n"; +print MBStringChars(mb_strcut($utf32be, 3, 9, 'UTF-32BE'), 'UTF-32BE') . "\n"; + +print "== ISO-2022-JP ==\n"; +print MBStringChars(mb_strcut($iso2022jp, 0, 3, 'ISO-2022-JP'), 'ISO-2022-JP') . "\n"; +print MBStringChars(mb_strcut($iso2022jp, 0, 4, 'ISO-2022-JP'), 'ISO-2022-JP') . "\n"; +print MBStringChars(mb_strcut($iso2022jp, 0, 5, 'ISO-2022-JP'), 'ISO-2022-JP') . "\n"; +print MBStringChars(mb_strcut($iso2022jp, 0, 6, 'ISO-2022-JP'), 'ISO-2022-JP') . "\n"; +print MBStringChars(mb_strcut($iso2022jp, 0, 7, 'ISO-2022-JP'), 'ISO-2022-JP') . "\n"; +print MBStringChars(mb_strcut($iso2022jp, 0, 8, 'ISO-2022-JP'), 'ISO-2022-JP') . "\n"; + +print MBStringChars(mb_strcut($iso2022jp, 1, 3, 'ISO-2022-JP'), 'ISO-2022-JP') . "\n"; +print MBStringChars(mb_strcut($iso2022jp, 1, 6, 'ISO-2022-JP'), 'ISO-2022-JP') . "\n"; +print MBStringChars(mb_strcut($iso2022jp, 1, 8, 'ISO-2022-JP'), 'ISO-2022-JP') . "\n"; + +print MBStringChars(mb_strcut($iso2022jp, 2, 5, 'ISO-2022-JP'), 'ISO-2022-JP') . "\n"; +print MBStringChars(mb_strcut($iso2022jp, 5, 9, 'ISO-2022-JP'), 'ISO-2022-JP') . "\n"; +print MBStringChars(mb_strcut($iso2022jp, 5, 11, 'ISO-2022-JP'), 'ISO-2022-JP') . "\n"; +print MBStringChars(mb_strcut($iso2022jp, 6, 13, 'ISO-2022-JP'), 'ISO-2022-JP') . "\n"; +print MBStringChars(mb_strcut($iso2022jp, 7, 13, 'ISO-2022-JP'), 'ISO-2022-JP') . "\n"; + +print MBStringChars(mb_strcut($iso2022jp, 1, 100, 'ISO-2022-JP'), 'ISO-2022-JP') . "\n"; +print MBStringChars(mb_strcut($iso2022jp, 50, 100, 'ISO-2022-JP'), 'ISO-2022-JP') . "\n"; + +print "Error followed by ASCII char: [" . bin2hex(mb_strcut("\xdaK", 0, 100, "ISO-2022-JP")) . "]\n"; + +print "== ISO-2022-JP-2004 ==\n"; +print MBStringChars(mb_strcut($iso2022jp2004, 0, 3, 'ISO-2022-JP-2004'), 'ISO-2022-JP-2004') . "\n"; +print MBStringChars(mb_strcut($iso2022jp2004, 0, 4, 'ISO-2022-JP-2004'), 'ISO-2022-JP-2004') . "\n"; +print MBStringChars(mb_strcut($iso2022jp2004, 0, 5, 'ISO-2022-JP-2004'), 'ISO-2022-JP-2004') . "\n"; +print MBStringChars(mb_strcut($iso2022jp2004, 0, 6, 'ISO-2022-JP-2004'), 'ISO-2022-JP-2004') . "\n"; +print MBStringChars(mb_strcut($iso2022jp2004, 0, 7, 'ISO-2022-JP-2004'), 'ISO-2022-JP-2004') . "\n"; +print MBStringChars(mb_strcut($iso2022jp2004, 0, 8, 'ISO-2022-JP-2004'), 'ISO-2022-JP-2004') . "\n"; +print MBStringChars(mb_strcut($iso2022jp2004, 0, 9, 'ISO-2022-JP-2004'), 'ISO-2022-JP-2004') . "\n"; + +print MBStringChars(mb_strcut($iso2022jp2004, 1, 3, 'ISO-2022-JP-2004'), 'ISO-2022-JP-2004') . "\n"; +print MBStringChars(mb_strcut($iso2022jp2004, 1, 6, 'ISO-2022-JP-2004'), 'ISO-2022-JP-2004') . "\n"; +print MBStringChars(mb_strcut($iso2022jp2004, 1, 8, 'ISO-2022-JP-2004'), 'ISO-2022-JP-2004') . "\n"; +print MBStringChars(mb_strcut($iso2022jp2004, 1, 9, 'ISO-2022-JP-2004'), 'ISO-2022-JP-2004') . "\n"; + +print MBStringChars(mb_strcut($iso2022jp2004, 2, 5, 'ISO-2022-JP-2004'), 'ISO-2022-JP-2004') . "\n"; +print MBStringChars(mb_strcut($iso2022jp2004, 5, 9, 'ISO-2022-JP-2004'), 'ISO-2022-JP-2004') . "\n"; +print MBStringChars(mb_strcut($iso2022jp2004, 5, 11, 'ISO-2022-JP-2004'), 'ISO-2022-JP-2004') . "\n"; +print MBStringChars(mb_strcut($iso2022jp2004, 6, 13, 'ISO-2022-JP-2004'), 'ISO-2022-JP-2004') . "\n"; +print MBStringChars(mb_strcut($iso2022jp2004, 7, 13, 'ISO-2022-JP-2004'), 'ISO-2022-JP-2004') . "\n"; + +print MBStringChars(mb_strcut($iso2022jp2004, 1, 100, 'ISO-2022-JP-2004'), 'ISO-2022-JP-2004') . "\n"; +print MBStringChars(mb_strcut($iso2022jp2004, 50, 100, 'ISO-2022-JP-2004'), 'ISO-2022-JP-2004') . "\n"; + +print "== ISO-2022-JP-MS ==\n"; +print MBStringChars(mb_strcut($iso2022jpms, 0, 3, 'ISO-2022-JP-MS'), 'ISO-2022-JP-MS') . "\n"; +print MBStringChars(mb_strcut($iso2022jpms, 0, 4, 'ISO-2022-JP-MS'), 'ISO-2022-JP-MS') . "\n"; +print MBStringChars(mb_strcut($iso2022jpms, 0, 5, 'ISO-2022-JP-MS'), 'ISO-2022-JP-MS') . "\n"; +print MBStringChars(mb_strcut($iso2022jpms, 0, 6, 'ISO-2022-JP-MS'), 'ISO-2022-JP-MS') . "\n"; +print MBStringChars(mb_strcut($iso2022jpms, 0, 7, 'ISO-2022-JP-MS'), 'ISO-2022-JP-MS') . "\n"; +print MBStringChars(mb_strcut($iso2022jpms, 0, 8, 'ISO-2022-JP-MS'), 'ISO-2022-JP-MS') . "\n"; +print MBStringChars(mb_strcut($iso2022jpms, 0, 9, 'ISO-2022-JP-MS'), 'ISO-2022-JP-MS') . "\n"; + +print MBStringChars(mb_strcut($iso2022jpms, 1, 3, 'ISO-2022-JP-MS'), 'ISO-2022-JP-MS') . "\n"; +print MBStringChars(mb_strcut($iso2022jpms, 1, 6, 'ISO-2022-JP-MS'), 'ISO-2022-JP-MS') . "\n"; +print MBStringChars(mb_strcut($iso2022jpms, 1, 8, 'ISO-2022-JP-MS'), 'ISO-2022-JP-MS') . "\n"; +print MBStringChars(mb_strcut($iso2022jpms, 1, 9, 'ISO-2022-JP-MS'), 'ISO-2022-JP-MS') . "\n"; + +print MBStringChars(mb_strcut($iso2022jpms, 2, 5, 'ISO-2022-JP-MS'), 'ISO-2022-JP-MS') . "\n"; +print MBStringChars(mb_strcut($iso2022jpms, 5, 9, 'ISO-2022-JP-MS'), 'ISO-2022-JP-MS') . "\n"; +print MBStringChars(mb_strcut($iso2022jpms, 5, 11, 'ISO-2022-JP-MS'), 'ISO-2022-JP-MS') . "\n"; +print MBStringChars(mb_strcut($iso2022jpms, 6, 13, 'ISO-2022-JP-MS'), 'ISO-2022-JP-MS') . "\n"; +print MBStringChars(mb_strcut($iso2022jpms, 7, 13, 'ISO-2022-JP-MS'), 'ISO-2022-JP-MS') . "\n"; + +print MBStringChars(mb_strcut($iso2022jpms, 1, 100, 'ISO-2022-JP-MS'), 'ISO-2022-JP-MS') . "\n"; +print MBStringChars(mb_strcut($iso2022jpms, 50, 100, 'ISO-2022-JP-MS'), 'ISO-2022-JP-MS') . "\n"; + +print "== JIS ==\n"; +print MBStringChars(mb_strcut($jis, 0, 3, 'JIS'), 'JIS') . "\n"; +print MBStringChars(mb_strcut($jis, 0, 4, 'JIS'), 'JIS') . "\n"; +print MBStringChars(mb_strcut($jis, 0, 5, 'JIS'), 'JIS') . "\n"; +print MBStringChars(mb_strcut($jis, 0, 6, 'JIS'), 'JIS') . "\n"; +print MBStringChars(mb_strcut($jis, 0, 7, 'JIS'), 'JIS') . "\n"; +print MBStringChars(mb_strcut($jis, 0, 8, 'JIS'), 'JIS') . "\n"; + +print MBStringChars(mb_strcut($jis, 1, 3, 'JIS'), 'JIS') . "\n"; +print MBStringChars(mb_strcut($jis, 1, 6, 'JIS'), 'JIS') . "\n"; +print MBStringChars(mb_strcut($jis, 1, 8, 'JIS'), 'JIS') . "\n"; + +print MBStringChars(mb_strcut($jis, 2, 5, 'JIS'), 'JIS') . "\n"; +print MBStringChars(mb_strcut($jis, 5, 9, 'JIS'), 'JIS') . "\n"; +print MBStringChars(mb_strcut($jis, 5, 11, 'JIS'), 'JIS') . "\n"; +print MBStringChars(mb_strcut($jis, 6, 13, 'JIS'), 'JIS') . "\n"; +print MBStringChars(mb_strcut($jis, 7, 13, 'JIS'), 'JIS') . "\n"; + +print MBStringChars(mb_strcut($jis, 1, 100, 'JIS'), 'JIS') . "\n"; +print MBStringChars(mb_strcut($jis, 50, 100, 'JIS'), 'JIS') . "\n"; + +print "0xA3: [" . bin2hex(mb_strcut("\xA3aaaaaa", 0, 100, 'JIS')) . "]\n"; +print "Bad escape sequence followed by null byte: [" . bin2hex(mb_strcut("\x1b\x00", 1, 100, "JIS")) . "]\n"; + +print "== ISO-2022-JP-KDDI ==\n"; +print MBStringChars(mb_strcut($iso2022jp_kddi, 0, 3, 'ISO-2022-JP-KDDI'), 'ISO-2022-JP-KDDI') . "\n"; +print MBStringChars(mb_strcut($iso2022jp_kddi, 0, 4, 'ISO-2022-JP-KDDI'), 'ISO-2022-JP-KDDI') . "\n"; +print MBStringChars(mb_strcut($iso2022jp_kddi, 0, 5, 'ISO-2022-JP-KDDI'), 'ISO-2022-JP-KDDI') . "\n"; +print MBStringChars(mb_strcut($iso2022jp_kddi, 0, 6, 'ISO-2022-JP-KDDI'), 'ISO-2022-JP-KDDI') . "\n"; +print MBStringChars(mb_strcut($iso2022jp_kddi, 0, 7, 'ISO-2022-JP-KDDI'), 'ISO-2022-JP-KDDI') . "\n"; +print MBStringChars(mb_strcut($iso2022jp_kddi, 0, 8, 'ISO-2022-JP-KDDI'), 'ISO-2022-JP-KDDI') . "\n"; + +print MBStringChars(mb_strcut($iso2022jp_kddi, 1, 3, 'ISO-2022-JP-KDDI'), 'ISO-2022-JP-KDDI') . "\n"; +print MBStringChars(mb_strcut($iso2022jp_kddi, 1, 6, 'ISO-2022-JP-KDDI'), 'ISO-2022-JP-KDDI') . "\n"; +print MBStringChars(mb_strcut($iso2022jp_kddi, 1, 8, 'ISO-2022-JP-KDDI'), 'ISO-2022-JP-KDDI') . "\n"; + +print MBStringChars(mb_strcut($iso2022jp_kddi, 2, 5, 'ISO-2022-JP-KDDI'), 'ISO-2022-JP-KDDI') . "\n"; +print MBStringChars(mb_strcut($iso2022jp_kddi, 5, 9, 'ISO-2022-JP-KDDI'), 'ISO-2022-JP-KDDI') . "\n"; +print MBStringChars(mb_strcut($iso2022jp_kddi, 5, 11, 'ISO-2022-JP-KDDI'), 'ISO-2022-JP-KDDI') . "\n"; +print MBStringChars(mb_strcut($iso2022jp_kddi, 6, 13, 'ISO-2022-JP-KDDI'), 'ISO-2022-JP-KDDI') . "\n"; +print MBStringChars(mb_strcut($iso2022jp_kddi, 7, 13, 'ISO-2022-JP-KDDI'), 'ISO-2022-JP-KDDI') . "\n"; + +print MBStringChars(mb_strcut($iso2022jp_kddi, 1, 100, 'ISO-2022-JP-KDDI'), 'ISO-2022-JP-KDDI') . "\n"; +print MBStringChars(mb_strcut($iso2022jp_kddi, 50, 100, 'ISO-2022-JP-KDDI'), 'ISO-2022-JP-KDDI') . "\n"; + +print "== CP50220 ==\n"; + +print "Single byte 0xFF: [" . bin2hex(mb_strcut("\xFF", 0, 100, 'CP50220')) . "]\n"; +print "Double byte 0xFF: [" . bin2hex(mb_strcut("\xFF\xFF", 0, 100, 'CP50220')) . "]\n"; +print "Sample string with multiple null bytes: [" . bin2hex(mb_strcut("\xCF\x00\x00\x00\x00\x00d\x00\x00", 0, 100, 'CP50220')) . "]\n"; +print "Bad escape sequence preceded by bad bytes: [" . bin2hex(mb_strcut("\xFF\xFF\x1B\x00", 0, 100, 'CP50220')) . "]\n"; +print "Good JISX 0208 sequence, but it won't fit in max number of bytes: [" . bin2hex(mb_strcut("\x1B\$BGV\x17", 0, 100, 'CP50220')) . "]\n"; +print "Bad escape sequence followed by GR kana: [" . bin2hex(mb_strcut("\x1B\$\xAC\x13", 0, 100, 'CP50220')) . "]\n"; + +print "== UTF-7 ==\n"; + +print "Single byte 0x01: [" . mb_strcut("\x01", 0, 100, 'UTF-7') . "]\n"; +print "UTF-16 section ends abruptly: [" . mb_strcut("+Q", 1, 100, 'UTF-7') . "]\n"; +print "UTF-16 section ends abruptly in middle of 2nd codepoint: [" . mb_strcut("+QxxC", 0, 100, 'UTF-7') . "]\n"; +print "Cutting in middle of UTF-16 section: [" . mb_strcut("+UUU", -1, 255, "UTF-7") . "]\n"; +print "Cutting in middle of UTF-16 section (2): [" . mb_strcut("+UUUU", -2, 255, "UTF-7") . "]\n"; + +print "== UTF7-IMAP ==\n"; + +print "Single byte 0x01: [" . mb_strcut("\x01", 0, 100, 'UTF7-IMAP') . "]\n"; +print "UTF-16 section ends abruptly: [" . mb_strcut("&Q", 1, 100, 'UTF7-IMAP') . "]\n"; +print "UTF-16 section ends abruptly in middle of 2nd codepoint: [" . mb_strcut("&QxxC", 0, 100, 'UTF7-IMAP') . "]\n"; +print "UTF-16 section is terminated improperly: [" . mb_strcut("&i6o\x83", 0, 100, 'UTF7-IMAP') . "]\n"; + +print "== GB18030 ==\n"; + +print "Invalid byte 0xF5: [" . bin2hex(mb_strcut("\xF5a", 1, 100, 'GB18030')) . "]\n"; +print "Double-byte char: [" . bin2hex(mb_strcut("\xAFw", -1, 100, "GB18030")) . "]\n"; + +print "== UHC ==\n"; + +print "Single byte 0x96: [" . bin2hex(mb_strcut("\x96", 1, 1280, "UHC")) . "]\n"; + ?> --EXPECT-- == EUC-JP == @@ -72,6 +243,15 @@ OK [] [e288ae] [e288ae 20] +[c3a5 42 c3a4 43 c3b6 44 c3bc] +== UTF-16 == +Single byte: [] +With from=1: [] +Bad surrogate: [] +Bad surrogate followed by other bytes: [003f1243] +BE byte order mark: [] +LE byte order mark: [] +Length=0: [] == UTF-16LE == [] [] @@ -80,3 +260,125 @@ OK [1a04] [1a04] [1a04 3804] +Single byte: [] +== UTF-32BE == +[] +[0000222e] +[0000222e] +[0000222e 00000020] +[0000222e 00000020] +== ISO-2022-JP == +[] +[] +[] +[] +[] +[1b244234411b2842] +[] +[] +[1b244234411b2842] +[] +[1b24423b7a1b2842 20] +[1b24423b7a1b2842 20 61 62] +[1b24423b7a1b2842 20 61 62 63 20] +[20 61 62 63 20 1b2442252b1b2842] +[1b244234411b2842 1b24423b7a1b2842 20 61 62 63 20 1b2442252b1b2842] +[] +Error followed by ASCII char: [4b] +== ISO-2022-JP-2004 == +[] +[] +[] +[] +[] +[] +[1b24285134411b2842] +[] +[] +[] +[1b24285134411b2842] +[] +[1b24285134411b2842] +[1b24285134411b2842 1b2428513b7a1b2842] +[1b2428513b7a1b2842 20 61 62 63] +[1b2428513b7a1b2842 20 61 62 63] +[1b24285134411b2842 1b2428513b7a1b2842 20 61 62 63 20 1b242851252b1b2842 1b242851254a1b2842] +[] +== ISO-2022-JP-MS == +[] +[] +[] +[] +[] +[1b244234411b2842] +[1b244234411b2842] +[] +[] +[1b244234411b2842] +[1b244234411b2842] +[] +[1b24423b7a1b2842 20] +[1b24423b7a1b2842 20 61 62] +[1b24423b7a1b2842 20 61 62 63 20] +[20 61 62 63 20 1b2442252b1b2842] +[1b244234411b2842 1b24423b7a1b2842 20 61 62 63 20 1b2442252b1b2842] +[] +== JIS == +[] +[] +[] +[] +[] +[1b244234411b2842] +[] +[] +[1b244234411b2842] +[] +[1b24423b7a1b2842 20] +[1b24423b7a1b2842 20 61 62] +[1b24423b7a1b2842 20 61 62 63 20] +[20 61 62 63 20 1b2442252b1b2842] +[1b244234411b2842 1b24423b7a1b2842 20 61 62 63 20 1b2442252b1b2842] +[] +0xA3: [] +Bad escape sequence followed by null byte: [] +== ISO-2022-JP-KDDI == +[] +[] +[] +[] +[] +[1b244234411b2842] +[] +[] +[1b244234411b2842] +[] +[1b24423b7a1b2842 20] +[1b24423b7a1b2842 20 61 62] +[1b24423b7a1b2842 20 61 62 63 20] +[20 61 62 63 20 1b2442252b1b2842] +[1b244234411b2842 1b24423b7a1b2842 20 61 62 63 20 1b2442252b1b2842] +[] +== CP50220 == +Single byte 0xFF: [] +Double byte 0xFF: [3f] +Sample string with multiple null bytes: [1b2442255e001b2842] +Bad escape sequence preceded by bad bytes: [3f3f3f00] +Good JISX 0208 sequence, but it won't fit in max number of bytes: [] +Bad escape sequence followed by GR kana: [] +== UTF-7 == +Single byte 0x01: [] +UTF-16 section ends abruptly: [] +UTF-16 section ends abruptly in middle of 2nd codepoint: [+Qxw-] +Cutting in middle of UTF-16 section: [] +Cutting in middle of UTF-16 section (2): [] +== UTF7-IMAP == +Single byte 0x01: [?] +UTF-16 section ends abruptly: [] +UTF-16 section ends abruptly in middle of 2nd codepoint: [] +UTF-16 section is terminated improperly: [] +== GB18030 == +Invalid byte 0xF5: [] +Double-byte char: [] +== UHC == +Single byte 0x96: [96]